diff --git a/.gitattributes b/.gitattributes
index 8a5dbf9b880f1b83d9812acbbd89ce7999b27b19..9e3bd09e030501d2c64e03fa740a7bda429ef395 100644
--- a/.gitattributes
+++ b/.gitattributes
@@ -421,3 +421,4 @@ tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cudnn/lib/
 .venv/lib/python3.11/site-packages/idna/__pycache__/idnadata.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text
 .venv/lib/python3.11/site-packages/pkg_resources/_vendor/pyparsing/__pycache__/core.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text
 .venv/lib/python3.11/site-packages/pkg_resources/__pycache__/__init__.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text
+.venv/lib/python3.11/site-packages/pkg_resources/_vendor/more_itertools/__pycache__/more.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text
diff --git a/.venv/lib/python3.11/site-packages/pkg_resources/_vendor/more_itertools/__pycache__/more.cpython-311.pyc b/.venv/lib/python3.11/site-packages/pkg_resources/_vendor/more_itertools/__pycache__/more.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..662bd9e75e176e272b1a78f33ebd52c12e6958fb
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/pkg_resources/_vendor/more_itertools/__pycache__/more.cpython-311.pyc
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c2915b1a6eb03bcc7f50087391a236782ed9422d5e5eb9ed0a937ca85ce5e73a
+size 167956
diff --git a/.venv/lib/python3.11/site-packages/transformers/data/__init__.py b/.venv/lib/python3.11/site-packages/transformers/data/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..8b675aae281f3220e305461ee955f52815b7fc03
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/transformers/data/__init__.py
@@ -0,0 +1,45 @@
+# Copyright 2020 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .data_collator import (
+    DataCollatorForLanguageModeling,
+    DataCollatorForPermutationLanguageModeling,
+    DataCollatorForSeq2Seq,
+    DataCollatorForSOP,
+    DataCollatorForTokenClassification,
+    DataCollatorForWholeWordMask,
+    DataCollatorWithFlattening,
+    DataCollatorWithPadding,
+    DefaultDataCollator,
+    default_data_collator,
+)
+from .metrics import glue_compute_metrics, xnli_compute_metrics
+from .processors import (
+    DataProcessor,
+    InputExample,
+    InputFeatures,
+    SingleSentenceClassificationProcessor,
+    SquadExample,
+    SquadFeatures,
+    SquadV1Processor,
+    SquadV2Processor,
+    glue_convert_examples_to_features,
+    glue_output_modes,
+    glue_processors,
+    glue_tasks_num_labels,
+    squad_convert_examples_to_features,
+    xnli_output_modes,
+    xnli_processors,
+    xnli_tasks_num_labels,
+)
diff --git a/.venv/lib/python3.11/site-packages/transformers/data/__pycache__/__init__.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/data/__pycache__/__init__.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d516b400e27b1a4dec7ad3c913b129a92642a0ea
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/data/__pycache__/__init__.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/transformers/data/__pycache__/data_collator.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/data/__pycache__/data_collator.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..356d3143bd10314f26381d87ddde3b3f9e7a0526
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/data/__pycache__/data_collator.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/transformers/data/data_collator.py b/.venv/lib/python3.11/site-packages/transformers/data/data_collator.py
new file mode 100644
index 0000000000000000000000000000000000000000..e84c9d0ef3ce2e7191f6ba7c6d5da209d559b91d
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/transformers/data/data_collator.py
@@ -0,0 +1,1656 @@
+# Copyright 2020 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import random
+import warnings
+from collections.abc import Mapping
+from dataclasses import dataclass
+from random import randint
+from typing import Any, Callable, Dict, List, NewType, Optional, Tuple, Union
+
+import numpy as np
+
+from ..models.bert import BertTokenizer, BertTokenizerFast
+from ..tokenization_utils_base import PreTrainedTokenizerBase
+from ..utils import PaddingStrategy
+
+
+InputDataClass = NewType("InputDataClass", Any)
+
+"""
+A DataCollator is a function that takes a list of samples from a Dataset and collate them into a batch, as a dictionary
+of PyTorch/TensorFlow tensors or NumPy arrays.
+"""
+DataCollator = NewType("DataCollator", Callable[[List[InputDataClass]], Dict[str, Any]])
+
+
+class DataCollatorMixin:
+    def __call__(self, features, return_tensors=None):
+        if return_tensors is None:
+            return_tensors = self.return_tensors
+        if return_tensors == "tf":
+            return self.tf_call(features)
+        elif return_tensors == "pt":
+            return self.torch_call(features)
+        elif return_tensors == "np":
+            return self.numpy_call(features)
+        else:
+            raise ValueError(f"Framework '{return_tensors}' not recognized!")
+
+
+def pad_without_fast_tokenizer_warning(tokenizer, *pad_args, **pad_kwargs):
+    """
+    Pads without triggering the warning about how using the pad function is sub-optimal when using a fast tokenizer.
+    """
+
+    # To avoid errors when using Feature extractors
+    if not hasattr(tokenizer, "deprecation_warnings"):
+        return tokenizer.pad(*pad_args, **pad_kwargs)
+
+    # Save the state of the warning, then disable it
+    warning_state = tokenizer.deprecation_warnings.get("Asking-to-pad-a-fast-tokenizer", False)
+    tokenizer.deprecation_warnings["Asking-to-pad-a-fast-tokenizer"] = True
+
+    try:
+        padded = tokenizer.pad(*pad_args, **pad_kwargs)
+    finally:
+        # Restore the state of the warning.
+        tokenizer.deprecation_warnings["Asking-to-pad-a-fast-tokenizer"] = warning_state
+
+    return padded
+
+
+def default_data_collator(features: List[InputDataClass], return_tensors="pt") -> Dict[str, Any]:
+    """
+    Very simple data collator that simply collates batches of dict-like objects and performs special handling for
+    potential keys named:
+
+        - `label`: handles a single value (int or float) per object
+        - `label_ids`: handles a list of values per object
+
+    Does not do any additional preprocessing: property names of the input object will be used as corresponding inputs
+    to the model. See glue and ner for example of how it's useful.
+    """
+
+    # In this function we'll make the assumption that all `features` in the batch
+    # have the same attributes.
+    # So we will look at the first element as a proxy for what attributes exist
+    # on the whole batch.
+
+    if return_tensors == "pt":
+        return torch_default_data_collator(features)
+    elif return_tensors == "tf":
+        return tf_default_data_collator(features)
+    elif return_tensors == "np":
+        return numpy_default_data_collator(features)
+
+
+@dataclass
+class DefaultDataCollator(DataCollatorMixin):
+    """
+    Very simple data collator that simply collates batches of dict-like objects and performs special handling for
+    potential keys named:
+
+        - `label`: handles a single value (int or float) per object
+        - `label_ids`: handles a list of values per object
+
+    Does not do any additional preprocessing: property names of the input object will be used as corresponding inputs
+    to the model. See glue and ner for example of how it's useful.
+
+    This is an object (like other data collators) rather than a pure function like default_data_collator. This can be
+    helpful if you need to set a return_tensors value at initialization.
+
+    Args:
+        return_tensors (`str`, *optional*, defaults to `"pt"`):
+            The type of Tensor to return. Allowable values are "np", "pt" and "tf".
+    """
+
+    return_tensors: str = "pt"
+
+    def __call__(self, features: List[Dict[str, Any]], return_tensors=None) -> Dict[str, Any]:
+        if return_tensors is None:
+            return_tensors = self.return_tensors
+        return default_data_collator(features, return_tensors)
+
+
+def torch_default_data_collator(features: List[InputDataClass]) -> Dict[str, Any]:
+    import torch
+
+    if not isinstance(features[0], Mapping):
+        features = [vars(f) for f in features]
+    first = features[0]
+    batch = {}
+
+    # Special handling for labels.
+    # Ensure that tensor is created with the correct type
+    # (it should be automatically the case, but let's make sure of it.)
+    if "label" in first and first["label"] is not None:
+        label = first["label"].item() if isinstance(first["label"], torch.Tensor) else first["label"]
+        dtype = torch.long if isinstance(label, int) else torch.float
+        batch["labels"] = torch.tensor([f["label"] for f in features], dtype=dtype)
+    elif "label_ids" in first and first["label_ids"] is not None:
+        if isinstance(first["label_ids"], torch.Tensor):
+            batch["labels"] = torch.stack([f["label_ids"] for f in features])
+        else:
+            dtype = torch.long if isinstance(first["label_ids"][0], int) else torch.float
+            batch["labels"] = torch.tensor([f["label_ids"] for f in features], dtype=dtype)
+
+    # Handling of all other possible keys.
+    # Again, we will use the first element to figure out which key/values are not None for this model.
+    for k, v in first.items():
+        if k not in ("label", "label_ids") and v is not None and not isinstance(v, str):
+            if isinstance(v, torch.Tensor):
+                batch[k] = torch.stack([f[k] for f in features])
+            elif isinstance(v, np.ndarray):
+                batch[k] = torch.from_numpy(np.stack([f[k] for f in features]))
+            else:
+                batch[k] = torch.tensor([f[k] for f in features])
+
+    return batch
+
+
+def tf_default_data_collator(features: List[InputDataClass]) -> Dict[str, Any]:
+    import tensorflow as tf
+
+    if not isinstance(features[0], Mapping):
+        features = [vars(f) for f in features]
+    first = features[0]
+    batch = {}
+
+    # Special handling for labels.
+    # Ensure that tensor is created with the correct type
+    # (it should be automatically the case, but let's make sure of it.)
+    if "label" in first and first["label"] is not None:
+        label_col_name = "label"
+    elif "label_ids" in first and first["label_ids"] is not None:
+        label_col_name = "label_ids"
+    elif "labels" in first and first["labels"] is not None:
+        label_col_name = "labels"
+    else:
+        label_col_name = None
+    if label_col_name is not None:
+        if isinstance(first[label_col_name], tf.Tensor):
+            dtype = tf.int64 if first[label_col_name].dtype.is_integer else tf.float32
+        elif isinstance(first[label_col_name], np.ndarray) or isinstance(first[label_col_name], np.generic):
+            dtype = tf.int64 if np.issubdtype(first[label_col_name].dtype, np.integer) else tf.float32
+        elif isinstance(first[label_col_name], (tuple, list)):
+            dtype = tf.int64 if isinstance(first[label_col_name][0], int) else tf.float32
+        else:
+            dtype = tf.int64 if isinstance(first[label_col_name], int) else tf.float32
+        batch["labels"] = tf.convert_to_tensor([f[label_col_name] for f in features], dtype=dtype)
+    # Handling of all other possible keys.
+    # Again, we will use the first element to figure out which key/values are not None for this model.
+    for k, v in first.items():
+        if k not in ("label", "label_ids", "labels") and v is not None and not isinstance(v, str):
+            if isinstance(v, (tf.Tensor, np.ndarray)):
+                batch[k] = tf.stack([f[k] for f in features])
+            else:
+                batch[k] = tf.convert_to_tensor([f[k] for f in features])
+
+    return batch
+
+
+def numpy_default_data_collator(features: List[InputDataClass]) -> Dict[str, Any]:
+    if not isinstance(features[0], Mapping):
+        features = [vars(f) for f in features]
+    first = features[0]
+    batch = {}
+
+    # Special handling for labels.
+    # Ensure that tensor is created with the correct type
+    # (it should be automatically the case, but let's make sure of it.)
+    if "label" in first and first["label"] is not None:
+        label = first["label"].item() if isinstance(first["label"], np.ndarray) else first["label"]
+        dtype = np.int64 if isinstance(label, int) else np.float32
+        batch["labels"] = np.array([f["label"] for f in features], dtype=dtype)
+    elif "label_ids" in first and first["label_ids"] is not None:
+        if isinstance(first["label_ids"], np.ndarray):
+            batch["labels"] = np.stack([f["label_ids"] for f in features])
+        else:
+            dtype = np.int64 if isinstance(first["label_ids"][0], int) else np.float32
+            batch["labels"] = np.array([f["label_ids"] for f in features], dtype=dtype)
+
+    # Handling of all other possible keys.
+    # Again, we will use the first element to figure out which key/values are not None for this model.
+    for k, v in first.items():
+        if k not in ("label", "label_ids") and v is not None and not isinstance(v, str):
+            if isinstance(v, np.ndarray):
+                batch[k] = np.stack([f[k] for f in features])
+            else:
+                batch[k] = np.array([f[k] for f in features])
+
+    return batch
+
+
+@dataclass
+class DataCollatorWithPadding:
+    """
+    Data collator that will dynamically pad the inputs received.
+
+    Args:
+        tokenizer ([`PreTrainedTokenizer`] or [`PreTrainedTokenizerFast`]):
+            The tokenizer used for encoding the data.
+        padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `True`):
+            Select a strategy to pad the returned sequences (according to the model's padding side and padding index)
+            among:
+
+            - `True` or `'longest'` (default): Pad to the longest sequence in the batch (or no padding if only a single
+              sequence is provided).
+            - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum
+              acceptable input length for the model if that argument is not provided.
+            - `False` or `'do_not_pad'`: No padding (i.e., can output a batch with sequences of different lengths).
+        max_length (`int`, *optional*):
+            Maximum length of the returned list and optionally padding length (see above).
+        pad_to_multiple_of (`int`, *optional*):
+            If set will pad the sequence to a multiple of the provided value.
+
+            This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability >=
+            7.0 (Volta).
+        return_tensors (`str`, *optional*, defaults to `"pt"`):
+            The type of Tensor to return. Allowable values are "np", "pt" and "tf".
+    """
+
+    tokenizer: PreTrainedTokenizerBase
+    padding: Union[bool, str, PaddingStrategy] = True
+    max_length: Optional[int] = None
+    pad_to_multiple_of: Optional[int] = None
+    return_tensors: str = "pt"
+
+    def __call__(self, features: List[Dict[str, Any]]) -> Dict[str, Any]:
+        batch = pad_without_fast_tokenizer_warning(
+            self.tokenizer,
+            features,
+            padding=self.padding,
+            max_length=self.max_length,
+            pad_to_multiple_of=self.pad_to_multiple_of,
+            return_tensors=self.return_tensors,
+        )
+        if "label" in batch:
+            batch["labels"] = batch["label"]
+            del batch["label"]
+        if "label_ids" in batch:
+            batch["labels"] = batch["label_ids"]
+            del batch["label_ids"]
+        return batch
+
+
+@dataclass
+class DataCollatorForTokenClassification(DataCollatorMixin):
+    """
+    Data collator that will dynamically pad the inputs received, as well as the labels.
+
+    Args:
+        tokenizer ([`PreTrainedTokenizer`] or [`PreTrainedTokenizerFast`]):
+            The tokenizer used for encoding the data.
+        padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `True`):
+            Select a strategy to pad the returned sequences (according to the model's padding side and padding index)
+            among:
+
+            - `True` or `'longest'` (default): Pad to the longest sequence in the batch (or no padding if only a single
+              sequence is provided).
+            - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum
+              acceptable input length for the model if that argument is not provided.
+            - `False` or `'do_not_pad'`: No padding (i.e., can output a batch with sequences of different lengths).
+        max_length (`int`, *optional*):
+            Maximum length of the returned list and optionally padding length (see above).
+        pad_to_multiple_of (`int`, *optional*):
+            If set will pad the sequence to a multiple of the provided value.
+
+            This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability >=
+            7.0 (Volta).
+        label_pad_token_id (`int`, *optional*, defaults to -100):
+            The id to use when padding the labels (-100 will be automatically ignore by PyTorch loss functions).
+        return_tensors (`str`, *optional*, defaults to `"pt"`):
+            The type of Tensor to return. Allowable values are "np", "pt" and "tf".
+    """
+
+    tokenizer: PreTrainedTokenizerBase
+    padding: Union[bool, str, PaddingStrategy] = True
+    max_length: Optional[int] = None
+    pad_to_multiple_of: Optional[int] = None
+    label_pad_token_id: int = -100
+    return_tensors: str = "pt"
+
+    def torch_call(self, features):
+        import torch
+
+        label_name = "label" if "label" in features[0].keys() else "labels"
+        labels = [feature[label_name] for feature in features] if label_name in features[0].keys() else None
+
+        no_labels_features = [{k: v for k, v in feature.items() if k != label_name} for feature in features]
+
+        batch = pad_without_fast_tokenizer_warning(
+            self.tokenizer,
+            no_labels_features,
+            padding=self.padding,
+            max_length=self.max_length,
+            pad_to_multiple_of=self.pad_to_multiple_of,
+            return_tensors="pt",
+        )
+
+        if labels is None:
+            return batch
+
+        sequence_length = batch["input_ids"].shape[1]
+        padding_side = self.tokenizer.padding_side
+
+        def to_list(tensor_or_iterable):
+            if isinstance(tensor_or_iterable, torch.Tensor):
+                return tensor_or_iterable.tolist()
+            return list(tensor_or_iterable)
+
+        if padding_side == "right":
+            batch[label_name] = [
+                to_list(label) + [self.label_pad_token_id] * (sequence_length - len(label)) for label in labels
+            ]
+        else:
+            batch[label_name] = [
+                [self.label_pad_token_id] * (sequence_length - len(label)) + to_list(label) for label in labels
+            ]
+
+        batch[label_name] = torch.tensor(batch[label_name], dtype=torch.int64)
+        return batch
+
+    def tf_call(self, features):
+        import tensorflow as tf
+
+        label_name = "label" if "label" in features[0].keys() else "labels"
+        labels = [feature[label_name] for feature in features] if label_name in features[0].keys() else None
+        batch = pad_without_fast_tokenizer_warning(
+            self.tokenizer,
+            features,
+            padding=self.padding,
+            max_length=self.max_length,
+            pad_to_multiple_of=self.pad_to_multiple_of,
+            # Conversion to tensors will fail if we have labels as they are not of the same length yet.
+            return_tensors="tf" if labels is None else None,
+        )
+
+        if labels is None:
+            return batch
+
+        sequence_length = tf.convert_to_tensor(batch["input_ids"]).shape[1]
+        padding_side = self.tokenizer.padding_side
+        if padding_side == "right":
+            batch["labels"] = [
+                list(label) + [self.label_pad_token_id] * (sequence_length - len(label)) for label in labels
+            ]
+        else:
+            batch["labels"] = [
+                [self.label_pad_token_id] * (sequence_length - len(label)) + list(label) for label in labels
+            ]
+
+        batch = {k: tf.convert_to_tensor(v, dtype=tf.int64) for k, v in batch.items()}
+        return batch
+
+    def numpy_call(self, features):
+        label_name = "label" if "label" in features[0].keys() else "labels"
+        labels = [feature[label_name] for feature in features] if label_name in features[0].keys() else None
+        batch = pad_without_fast_tokenizer_warning(
+            self.tokenizer,
+            features,
+            padding=self.padding,
+            max_length=self.max_length,
+            pad_to_multiple_of=self.pad_to_multiple_of,
+            # Conversion to tensors will fail if we have labels as they are not of the same length yet.
+            return_tensors="np" if labels is None else None,
+        )
+
+        if labels is None:
+            return batch
+
+        sequence_length = np.array(batch["input_ids"]).shape[1]
+        padding_side = self.tokenizer.padding_side
+        if padding_side == "right":
+            batch["labels"] = [
+                list(label) + [self.label_pad_token_id] * (sequence_length - len(label)) for label in labels
+            ]
+        else:
+            batch["labels"] = [
+                [self.label_pad_token_id] * (sequence_length - len(label)) + list(label) for label in labels
+            ]
+
+        batch = {k: np.array(v, dtype=np.int64) for k, v in batch.items()}
+        return batch
+
+
+def _torch_collate_batch(examples, tokenizer, pad_to_multiple_of: Optional[int] = None):
+    """Collate `examples` into a batch, using the information in `tokenizer` for padding if necessary."""
+    import torch
+
+    # Tensorize if necessary.
+    if isinstance(examples[0], (list, tuple, np.ndarray)):
+        examples = [torch.tensor(e, dtype=torch.long) for e in examples]
+
+    length_of_first = examples[0].size(0)
+
+    # Check if padding is necessary.
+
+    are_tensors_same_length = all(x.size(0) == length_of_first for x in examples)
+    if are_tensors_same_length and (pad_to_multiple_of is None or length_of_first % pad_to_multiple_of == 0):
+        if not isinstance(examples, torch.Tensor):
+            return torch.stack(examples, dim=0)
+
+    # If yes, check if we have a `pad_token`.
+    if tokenizer.pad_token is None:
+        raise ValueError(
+            "You are attempting to pad samples but the tokenizer you are using"
+            f" ({tokenizer.__class__.__name__}) does not have a pad token."
+        )
+
+    # Creating the full tensor and filling it with our data.
+    max_length = max(x.size(0) for x in examples)
+    if pad_to_multiple_of is not None and (max_length % pad_to_multiple_of != 0):
+        max_length = ((max_length // pad_to_multiple_of) + 1) * pad_to_multiple_of
+    result = examples[0].new_full([len(examples), max_length], tokenizer.pad_token_id)
+    for i, example in enumerate(examples):
+        if tokenizer.padding_side == "right":
+            result[i, : example.shape[0]] = example
+        else:
+            result[i, -example.shape[0] :] = example
+    return result
+
+
+def _tf_collate_batch(examples, tokenizer, pad_to_multiple_of: Optional[int] = None):
+    import tensorflow as tf
+
+    """Collate `examples` into a batch, using the information in `tokenizer` for padding if necessary."""
+    # Tensorize if necessary.
+    if isinstance(examples[0], (list, tuple)):
+        examples = [tf.convert_to_tensor(e, dtype=tf.int64) for e in examples]
+
+    # Check if padding is necessary.
+    length_of_first = len(examples[0])
+    are_tensors_same_length = all(len(x) == length_of_first for x in examples)
+    if are_tensors_same_length and (pad_to_multiple_of is None or length_of_first % pad_to_multiple_of == 0):
+        return tf.stack(examples, axis=0)
+
+    # If yes, check if we have a `pad_token`.
+    if tokenizer.pad_token is None:
+        raise ValueError(
+            "You are attempting to pad samples but the tokenizer you are using"
+            f" ({tokenizer.__class__.__name__}) does not have a pad token."
+        )
+
+    # Creating the full tensor and filling it with our data.
+    max_length = max(len(x) for x in examples)
+    if pad_to_multiple_of is not None and (max_length % pad_to_multiple_of != 0):
+        max_length = ((max_length // pad_to_multiple_of) + 1) * pad_to_multiple_of
+    # result = examples[0].new_full([len(examples), max_length], tokenizer.pad_token_id)
+    result = []
+    rank = tf.rank(examples[0])
+    paddings = np.zeros((rank, 2), dtype=np.int32)
+    for example in examples:
+        if tokenizer.padding_side == "right":
+            paddings[0, 1] = max_length - len(example)
+        else:
+            paddings[0, 0] = max_length - len(example)
+        result.append(tf.pad(example, paddings, constant_values=tokenizer.pad_token_id))
+    return tf.stack(result, axis=0)
+
+
+def _numpy_collate_batch(examples, tokenizer, pad_to_multiple_of: Optional[int] = None):
+    """Collate `examples` into a batch, using the information in `tokenizer` for padding if necessary."""
+    # Tensorize if necessary.
+    if isinstance(examples[0], (list, tuple)):
+        examples = [np.array(e, dtype=np.int64) for e in examples]
+
+    # Check if padding is necessary.
+    length_of_first = len(examples[0])
+    are_tensors_same_length = all(len(x) == length_of_first for x in examples)
+    if are_tensors_same_length and (pad_to_multiple_of is None or length_of_first % pad_to_multiple_of == 0):
+        return np.stack(examples, axis=0)
+
+    # If yes, check if we have a `pad_token`.
+    if tokenizer.pad_token is None:
+        raise ValueError(
+            "You are attempting to pad samples but the tokenizer you are using"
+            f" ({tokenizer.__class__.__name__}) does not have a pad token."
+        )
+
+    # Creating the full tensor and filling it with our data.
+    max_length = max(len(x) for x in examples)
+    if pad_to_multiple_of is not None and (max_length % pad_to_multiple_of != 0):
+        max_length = ((max_length // pad_to_multiple_of) + 1) * pad_to_multiple_of
+    result = np.full(shape=(len(examples), max_length), fill_value=tokenizer.pad_token_id, dtype=examples[0].dtype)
+    for i, example in enumerate(examples):
+        if tokenizer.padding_side == "right":
+            result[i, : example.shape[0]] = example
+        else:
+            result[i, -example.shape[0] :] = example
+    return result
+
+
+def tolist(x):
+    if isinstance(x, list):
+        return x
+    elif hasattr(x, "numpy"):  # Checks for TF tensors without needing the import
+        x = x.numpy()
+    return x.tolist()
+
+
+@dataclass
+class DataCollatorForSeq2Seq:
+    """
+    Data collator that will dynamically pad the inputs received, as well as the labels.
+
+    Args:
+        tokenizer ([`PreTrainedTokenizer`] or [`PreTrainedTokenizerFast`]):
+            The tokenizer used for encoding the data.
+        model ([`PreTrainedModel`], *optional*):
+            The model that is being trained. If set and has the *prepare_decoder_input_ids_from_labels*, use it to
+            prepare the *decoder_input_ids*
+
+            This is useful when using *label_smoothing* to avoid calculating loss twice.
+        padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `True`):
+            Select a strategy to pad the returned sequences (according to the model's padding side and padding index)
+            among:
+
+            - `True` or `'longest'` (default): Pad to the longest sequence in the batch (or no padding if only a single
+              sequence is provided).
+            - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum
+              acceptable input length for the model if that argument is not provided.
+            - `False` or `'do_not_pad'`: No padding (i.e., can output a batch with sequences of different lengths).
+        max_length (`int`, *optional*):
+            Maximum length of the returned list and optionally padding length (see above).
+        pad_to_multiple_of (`int`, *optional*):
+            If set will pad the sequence to a multiple of the provided value.
+
+            This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability >=
+            7.0 (Volta).
+        label_pad_token_id (`int`, *optional*, defaults to -100):
+            The id to use when padding the labels (-100 will be automatically ignored by PyTorch loss functions).
+        return_tensors (`str`, *optional*, defaults to `"pt"`):
+            The type of Tensor to return. Allowable values are "np", "pt" and "tf".
+    """
+
+    tokenizer: PreTrainedTokenizerBase
+    model: Optional[Any] = None
+    padding: Union[bool, str, PaddingStrategy] = True
+    max_length: Optional[int] = None
+    pad_to_multiple_of: Optional[int] = None
+    label_pad_token_id: int = -100
+    return_tensors: str = "pt"
+
+    def __call__(self, features, return_tensors=None):
+        if return_tensors is None:
+            return_tensors = self.return_tensors
+
+        label_name = "label" if "label" in features[0].keys() else "labels"
+        labels = [feature[label_name] for feature in features] if label_name in features[0].keys() else None
+        # reconvert list[None] to None if necessary
+        # this might occur when we pass {..., "labels": None}
+        if labels is not None and all(label is None for label in labels):
+            labels = None
+        non_labels_features = [{k: v for k, v in feature.items() if k != label_name} for feature in features]
+
+        # run through tokenizer without labels to ensure no side effects
+        batch = pad_without_fast_tokenizer_warning(
+            self.tokenizer,
+            non_labels_features,
+            padding=self.padding,
+            max_length=self.max_length,
+            pad_to_multiple_of=self.pad_to_multiple_of,
+            return_tensors=return_tensors,
+        )
+
+        # we have to pad the labels manually as we cannot rely on `tokenizer.pad` and we need them to be of the same length to return tensors
+        no_padding = self.padding is False or self.padding == PaddingStrategy.DO_NOT_PAD
+        if labels is not None:
+            if no_padding:
+                if isinstance(features[0][label_name], list):
+                    batch["labels"] = list(labels)
+                else:
+                    batch["labels"] = [np.concatenate([label, []]) for label in labels]
+            else:
+                max_padding = self.padding == PaddingStrategy.MAX_LENGTH and self.max_length is not None
+                max_label_length = max(len(l) for l in labels) if not max_padding else self.max_length
+                if self.pad_to_multiple_of is not None:
+                    max_label_length = (
+                        (max_label_length + self.pad_to_multiple_of - 1)
+                        // self.pad_to_multiple_of
+                        * self.pad_to_multiple_of
+                    )
+
+                padding_side = self.tokenizer.padding_side
+                if isinstance(features[0][label_name], list):
+                    batch["labels"] = [
+                        label + [self.label_pad_token_id] * (max_label_length - len(label))
+                        if padding_side == "right"
+                        else [self.label_pad_token_id] * (max_label_length - len(label)) + label
+                        for label in labels
+                    ]
+                else:
+                    batch["labels"] = [
+                        np.concatenate(
+                            [
+                                label,
+                                np.array([self.label_pad_token_id] * (max_label_length - len(label)), dtype=np.int64),
+                            ]
+                        )
+                        if padding_side == "right"
+                        else np.concatenate(
+                            [
+                                np.array([self.label_pad_token_id] * (max_label_length - len(label)), dtype=np.int64),
+                                label,
+                            ]
+                        )
+                        for label in labels
+                    ]
+
+        # reintroduce side effects via tokenizer that return respective datatypes for the `return_tensors` argument
+        if batch.get("labels", None) is not None:
+            if return_tensors == "pt":
+                import torch
+
+                batch["labels"] = torch.tensor(batch["labels"], dtype=torch.int64)
+            elif return_tensors == "tf":
+                import tensorflow as tf
+
+                batch["labels"] = tf.constant(batch["labels"], dtype=tf.int64)
+            else:
+                batch["labels"] = np.array(batch["labels"], dtype=np.int64)
+        else:
+            batch["labels"] = None
+
+        # prepare decoder_input_ids
+        if (
+            labels is not None
+            and self.model is not None
+            and hasattr(self.model, "prepare_decoder_input_ids_from_labels")
+        ):
+            decoder_input_ids = self.model.prepare_decoder_input_ids_from_labels(labels=batch["labels"])
+            batch["decoder_input_ids"] = decoder_input_ids
+
+        return batch
+
+
+@dataclass
+class DataCollatorForLanguageModeling(DataCollatorMixin):
+    """
+    Data collator used for language modeling. Inputs are dynamically padded to the maximum length of a batch if they
+    are not all of the same length.
+
+    Args:
+        tokenizer ([`PreTrainedTokenizer`] or [`PreTrainedTokenizerFast`]):
+            The tokenizer used for encoding the data.
+        mlm (`bool`, *optional*, defaults to `True`):
+            Whether or not to use masked language modeling. If set to `False`, the labels are the same as the inputs
+            with the padding tokens ignored (by setting them to -100). Otherwise, the labels are -100 for non-masked
+            tokens and the value to predict for the masked token.
+        mlm_probability (`float`, *optional*, defaults to 0.15):
+            The probability with which to (randomly) mask tokens in the input, when `mlm` is set to `True`.
+        pad_to_multiple_of (`int`, *optional*):
+            If set will pad the sequence to a multiple of the provided value.
+
+            This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability >=
+            7.0 (Volta).
+        return_tensors (`str`):
+            The type of Tensor to return. Allowable values are "np", "pt" and "tf".
+
+    <Tip>
+
+    For best performance, this data collator should be used with a dataset having items that are dictionaries or
+    BatchEncoding, with the `"special_tokens_mask"` key, as returned by a [`PreTrainedTokenizer`] or a
+    [`PreTrainedTokenizerFast`] with the argument `return_special_tokens_mask=True`.
+
+    </Tip>"""
+
+    tokenizer: PreTrainedTokenizerBase
+    mlm: bool = True
+    mlm_probability: float = 0.15
+    pad_to_multiple_of: Optional[int] = None
+    tf_experimental_compile: bool = False
+    return_tensors: str = "pt"
+
+    def __post_init__(self):
+        if self.mlm and self.tokenizer.mask_token is None:
+            raise ValueError(
+                "This tokenizer does not have a mask token which is necessary for masked language modeling. "
+                "You should pass `mlm=False` to train on causal language modeling instead."
+            )
+        if self.tf_experimental_compile:
+            import tensorflow as tf
+
+            self.tf_mask_tokens = tf.function(self.tf_mask_tokens, jit_compile=True)
+
+    @staticmethod
+    def tf_bernoulli(shape, probability):
+        import tensorflow as tf
+
+        prob_matrix = tf.fill(shape, probability)
+        return tf.cast(prob_matrix - tf.random.uniform(shape, 0, 1) >= 0, tf.bool)
+
+    def tf_mask_tokens(
+        self, inputs: Any, vocab_size, mask_token_id, special_tokens_mask: Optional[Any] = None
+    ) -> Tuple[Any, Any]:
+        """
+        Prepare masked tokens inputs/labels for masked language modeling: 80% MASK, 10% random, 10% original.
+        """
+        import tensorflow as tf
+
+        mask_token_id = tf.cast(mask_token_id, inputs.dtype)
+
+        input_shape = tf.shape(inputs)
+        # 1 for a special token, 0 for a normal token in the special tokens mask
+        # We sample a few tokens in each sequence for MLM training (with probability `self.mlm_probability`)
+        masked_indices = self.tf_bernoulli(input_shape, self.mlm_probability) & ~special_tokens_mask
+        # Replace unmasked indices with -100 in the labels since we only compute loss on masked tokens
+        labels = tf.where(masked_indices, inputs, -100)
+
+        # 80% of the time, we replace masked input tokens with tokenizer.mask_token ([MASK])
+        indices_replaced = self.tf_bernoulli(input_shape, 0.8) & masked_indices
+
+        inputs = tf.where(indices_replaced, mask_token_id, inputs)
+
+        # 10% of the time, we replace masked input tokens with random word
+        indices_random = self.tf_bernoulli(input_shape, 0.5) & masked_indices & ~indices_replaced
+        random_words = tf.random.uniform(input_shape, maxval=vocab_size, dtype=inputs.dtype)
+
+        inputs = tf.where(indices_random, random_words, inputs)
+
+        # The rest of the time (10% of the time) we keep the masked input tokens unchanged
+        return inputs, labels
+
+    def tf_call(self, examples: List[Union[List[int], Any, Dict[str, Any]]]) -> Dict[str, Any]:
+        import tensorflow as tf
+
+        # Handle dict or lists with proper padding and conversion to tensor.
+        if isinstance(examples[0], Mapping):
+            batch = pad_without_fast_tokenizer_warning(
+                self.tokenizer, examples, return_tensors="tf", pad_to_multiple_of=self.pad_to_multiple_of
+            )
+        else:
+            batch = {
+                "input_ids": _tf_collate_batch(examples, self.tokenizer, pad_to_multiple_of=self.pad_to_multiple_of)
+            }
+
+        # If special token mask has been preprocessed, pop it from the dict.
+        special_tokens_mask = batch.pop("special_tokens_mask", None)
+        if self.mlm:
+            if special_tokens_mask is None:
+                special_tokens_mask = [
+                    self.tokenizer.get_special_tokens_mask(val, already_has_special_tokens=True)
+                    for val in batch["input_ids"].numpy().tolist()
+                ]
+                # Cannot directly create as bool
+                special_tokens_mask = tf.cast(tf.convert_to_tensor(special_tokens_mask, dtype=tf.int64), tf.bool)
+            else:
+                special_tokens_mask = tf.cast(special_tokens_mask, tf.bool)
+            batch["input_ids"], batch["labels"] = self.tf_mask_tokens(
+                tf.cast(batch["input_ids"], tf.int64),
+                special_tokens_mask=special_tokens_mask,
+                mask_token_id=self.tokenizer.mask_token_id,
+                vocab_size=len(self.tokenizer),
+            )
+        else:
+            labels = batch["input_ids"]
+            if self.tokenizer.pad_token_id is not None:
+                # Replace self.tokenizer.pad_token_id with -100
+                labels = tf.where(labels == self.tokenizer.pad_token_id, -100, labels)
+            else:
+                labels = tf.identity(labels)  # Makes a copy, just in case
+            batch["labels"] = labels
+        return batch
+
+    def torch_call(self, examples: List[Union[List[int], Any, Dict[str, Any]]]) -> Dict[str, Any]:
+        # Handle dict or lists with proper padding and conversion to tensor.
+        if isinstance(examples[0], Mapping):
+            batch = pad_without_fast_tokenizer_warning(
+                self.tokenizer, examples, return_tensors="pt", pad_to_multiple_of=self.pad_to_multiple_of
+            )
+        else:
+            batch = {
+                "input_ids": _torch_collate_batch(examples, self.tokenizer, pad_to_multiple_of=self.pad_to_multiple_of)
+            }
+
+        # If special token mask has been preprocessed, pop it from the dict.
+        special_tokens_mask = batch.pop("special_tokens_mask", None)
+        if self.mlm:
+            batch["input_ids"], batch["labels"] = self.torch_mask_tokens(
+                batch["input_ids"], special_tokens_mask=special_tokens_mask
+            )
+        else:
+            labels = batch["input_ids"].clone()
+            if self.tokenizer.pad_token_id is not None:
+                labels[labels == self.tokenizer.pad_token_id] = -100
+            batch["labels"] = labels
+        return batch
+
+    def torch_mask_tokens(self, inputs: Any, special_tokens_mask: Optional[Any] = None) -> Tuple[Any, Any]:
+        """
+        Prepare masked tokens inputs/labels for masked language modeling: 80% MASK, 10% random, 10% original.
+        """
+        import torch
+
+        labels = inputs.clone()
+        # We sample a few tokens in each sequence for MLM training (with probability `self.mlm_probability`)
+        probability_matrix = torch.full(labels.shape, self.mlm_probability)
+        if special_tokens_mask is None:
+            special_tokens_mask = [
+                self.tokenizer.get_special_tokens_mask(val, already_has_special_tokens=True) for val in labels.tolist()
+            ]
+            special_tokens_mask = torch.tensor(special_tokens_mask, dtype=torch.bool)
+        else:
+            special_tokens_mask = special_tokens_mask.bool()
+
+        probability_matrix.masked_fill_(special_tokens_mask, value=0.0)
+        masked_indices = torch.bernoulli(probability_matrix).bool()
+        labels[~masked_indices] = -100  # We only compute loss on masked tokens
+
+        # 80% of the time, we replace masked input tokens with tokenizer.mask_token ([MASK])
+        indices_replaced = torch.bernoulli(torch.full(labels.shape, 0.8)).bool() & masked_indices
+        inputs[indices_replaced] = self.tokenizer.convert_tokens_to_ids(self.tokenizer.mask_token)
+
+        # 10% of the time, we replace masked input tokens with random word
+        indices_random = torch.bernoulli(torch.full(labels.shape, 0.5)).bool() & masked_indices & ~indices_replaced
+        random_words = torch.randint(len(self.tokenizer), labels.shape, dtype=torch.long)
+        inputs[indices_random] = random_words[indices_random]
+
+        # The rest of the time (10% of the time) we keep the masked input tokens unchanged
+        return inputs, labels
+
+    def numpy_call(self, examples: List[Union[List[int], Any, Dict[str, Any]]]) -> Dict[str, Any]:
+        # Handle dict or lists with proper padding and conversion to tensor.
+        if isinstance(examples[0], Mapping):
+            batch = pad_without_fast_tokenizer_warning(
+                self.tokenizer, examples, return_tensors="np", pad_to_multiple_of=self.pad_to_multiple_of
+            )
+        else:
+            batch = {
+                "input_ids": _numpy_collate_batch(examples, self.tokenizer, pad_to_multiple_of=self.pad_to_multiple_of)
+            }
+
+        # If special token mask has been preprocessed, pop it from the dict.
+        special_tokens_mask = batch.pop("special_tokens_mask", None)
+        if self.mlm:
+            batch["input_ids"], batch["labels"] = self.numpy_mask_tokens(
+                batch["input_ids"], special_tokens_mask=special_tokens_mask
+            )
+        else:
+            labels = np.copy(batch["input_ids"])
+            if self.tokenizer.pad_token_id is not None:
+                labels[labels == self.tokenizer.pad_token_id] = -100
+            batch["labels"] = labels
+        return batch
+
+    def numpy_mask_tokens(self, inputs: Any, special_tokens_mask: Optional[Any] = None) -> Tuple[Any, Any]:
+        """
+        Prepare masked tokens inputs/labels for masked language modeling: 80% MASK, 10% random, 10% original.
+        """
+        labels = np.copy(inputs)
+        # We sample a few tokens in each sequence for MLM training (with probability `self.mlm_probability`)
+        probability_matrix = np.full(labels.shape, self.mlm_probability)
+        if special_tokens_mask is None:
+            special_tokens_mask = [
+                self.tokenizer.get_special_tokens_mask(val, already_has_special_tokens=True) for val in labels.tolist()
+            ]
+            special_tokens_mask = np.array(special_tokens_mask, dtype=bool)
+        else:
+            special_tokens_mask = special_tokens_mask.astype(bool)
+
+        probability_matrix[special_tokens_mask] = 0
+        # Numpy doesn't have bernoulli, so we use a binomial with 1 trial
+        masked_indices = np.random.binomial(1, probability_matrix, size=probability_matrix.shape).astype(bool)
+        labels[~masked_indices] = -100  # We only compute loss on masked tokens
+
+        # 80% of the time, we replace masked input tokens with tokenizer.mask_token ([MASK])
+        indices_replaced = np.random.binomial(1, 0.8, size=labels.shape).astype(bool) & masked_indices
+        inputs[indices_replaced] = self.tokenizer.mask_token_id
+
+        # 10% of the time, we replace masked input tokens with random word
+        # indices_random = torch.bernoulli(torch.full(labels.shape, 0.5)).bool() & masked_indices & ~indices_replaced
+        indices_random = (
+            np.random.binomial(1, 0.5, size=labels.shape).astype(bool) & masked_indices & ~indices_replaced
+        )
+        random_words = np.random.randint(
+            low=0, high=len(self.tokenizer), size=np.count_nonzero(indices_random), dtype=np.int64
+        )
+        inputs[indices_random] = random_words
+
+        # The rest of the time (10% of the time) we keep the masked input tokens unchanged
+        return inputs, labels
+
+
+@dataclass
+class DataCollatorForWholeWordMask(DataCollatorForLanguageModeling):
+    """
+    Data collator used for language modeling that masks entire words.
+
+    - collates batches of tensors, honoring their tokenizer's pad_token
+    - preprocesses batches for masked language modeling
+
+    <Tip>
+
+    This collator relies on details of the implementation of subword tokenization by [`BertTokenizer`], specifically
+    that subword tokens are prefixed with *##*. For tokenizers that do not adhere to this scheme, this collator will
+    produce an output that is roughly equivalent to [`.DataCollatorForLanguageModeling`].
+
+    </Tip>"""
+
+    def torch_call(self, examples: List[Union[List[int], Any, Dict[str, Any]]]) -> Dict[str, Any]:
+        if isinstance(examples[0], Mapping):
+            input_ids = [e["input_ids"] for e in examples]
+        else:
+            input_ids = examples
+            examples = [{"input_ids": e} for e in examples]
+
+        batch_input = _torch_collate_batch(input_ids, self.tokenizer, pad_to_multiple_of=self.pad_to_multiple_of)
+
+        mask_labels = []
+        for e in examples:
+            ref_tokens = []
+            for id in tolist(e["input_ids"]):
+                token = self.tokenizer._convert_id_to_token(id)
+                ref_tokens.append(token)
+
+            # For Chinese tokens, we need extra inf to mark sub-word, e.g [喜,欢]-> [喜，##欢]
+            if "chinese_ref" in e:
+                ref_pos = tolist(e["chinese_ref"])
+                len_seq = len(e["input_ids"])
+                for i in range(len_seq):
+                    if i in ref_pos:
+                        ref_tokens[i] = "##" + ref_tokens[i]
+            mask_labels.append(self._whole_word_mask(ref_tokens))
+        batch_mask = _torch_collate_batch(mask_labels, self.tokenizer, pad_to_multiple_of=self.pad_to_multiple_of)
+        inputs, labels = self.torch_mask_tokens(batch_input, batch_mask)
+        return {"input_ids": inputs, "labels": labels}
+
+    def tf_call(self, examples: List[Union[List[int], Any, Dict[str, Any]]]) -> Dict[str, Any]:
+        import tensorflow as tf
+
+        if isinstance(examples[0], Mapping):
+            input_ids = [e["input_ids"] for e in examples]
+        else:
+            input_ids = examples
+            examples = [{"input_ids": e} for e in examples]
+
+        batch_input = _tf_collate_batch(input_ids, self.tokenizer, pad_to_multiple_of=self.pad_to_multiple_of)
+
+        mask_labels = []
+        for e in examples:
+            ref_tokens = []
+            for id in tolist(e["input_ids"]):
+                token = self.tokenizer._convert_id_to_token(id)
+                ref_tokens.append(token)
+
+            # For Chinese tokens, we need extra inf to mark sub-word, e.g [喜,欢]-> [喜，##欢]
+            if "chinese_ref" in e:
+                ref_pos = tolist(e["chinese_ref"])
+                len_seq = len(e["input_ids"])
+                for i in range(len_seq):
+                    if i in ref_pos:
+                        ref_tokens[i] = "##" + ref_tokens[i]
+            mask_labels.append(self._whole_word_mask(ref_tokens))
+        batch_mask = _tf_collate_batch(mask_labels, self.tokenizer, pad_to_multiple_of=self.pad_to_multiple_of)
+        inputs, labels = self.tf_mask_tokens(tf.cast(batch_input, tf.int64), batch_mask)
+        return {"input_ids": inputs, "labels": labels}
+
+    def numpy_call(self, examples: List[Union[List[int], Any, Dict[str, Any]]]) -> Dict[str, Any]:
+        if isinstance(examples[0], Mapping):
+            input_ids = [e["input_ids"] for e in examples]
+        else:
+            input_ids = examples
+            examples = [{"input_ids": e} for e in examples]
+
+        batch_input = _numpy_collate_batch(input_ids, self.tokenizer, pad_to_multiple_of=self.pad_to_multiple_of)
+
+        mask_labels = []
+        for e in examples:
+            ref_tokens = []
+            for id in tolist(e["input_ids"]):
+                token = self.tokenizer._convert_id_to_token(id)
+                ref_tokens.append(token)
+
+            # For Chinese tokens, we need extra inf to mark sub-word, e.g [喜,欢]-> [喜，##欢]
+            if "chinese_ref" in e:
+                ref_pos = tolist(e["chinese_ref"])
+                len_seq = len(e["input_ids"])
+                for i in range(len_seq):
+                    if i in ref_pos:
+                        ref_tokens[i] = "##" + ref_tokens[i]
+            mask_labels.append(self._whole_word_mask(ref_tokens))
+        batch_mask = _numpy_collate_batch(mask_labels, self.tokenizer, pad_to_multiple_of=self.pad_to_multiple_of)
+        inputs, labels = self.numpy_mask_tokens(batch_input, batch_mask)
+        return {"input_ids": inputs, "labels": labels}
+
+    def _whole_word_mask(self, input_tokens: List[str], max_predictions=512):
+        """
+        Get 0/1 labels for masked tokens with whole word mask proxy
+        """
+        if not isinstance(self.tokenizer, (BertTokenizer, BertTokenizerFast)):
+            warnings.warn(
+                "DataCollatorForWholeWordMask is only suitable for BertTokenizer-like tokenizers. "
+                "Please refer to the documentation for more information."
+            )
+
+        cand_indexes = []
+        for i, token in enumerate(input_tokens):
+            if token == "[CLS]" or token == "[SEP]":
+                continue
+
+            if len(cand_indexes) >= 1 and token.startswith("##"):
+                cand_indexes[-1].append(i)
+            else:
+                cand_indexes.append([i])
+
+        random.shuffle(cand_indexes)
+        num_to_predict = min(max_predictions, max(1, int(round(len(input_tokens) * self.mlm_probability))))
+        masked_lms = []
+        covered_indexes = set()
+        for index_set in cand_indexes:
+            if len(masked_lms) >= num_to_predict:
+                break
+            # If adding a whole-word mask would exceed the maximum number of
+            # predictions, then just skip this candidate.
+            if len(masked_lms) + len(index_set) > num_to_predict:
+                continue
+            is_any_index_covered = False
+            for index in index_set:
+                if index in covered_indexes:
+                    is_any_index_covered = True
+                    break
+            if is_any_index_covered:
+                continue
+            for index in index_set:
+                covered_indexes.add(index)
+                masked_lms.append(index)
+
+        if len(covered_indexes) != len(masked_lms):
+            raise ValueError("Length of covered_indexes is not equal to length of masked_lms.")
+        mask_labels = [1 if i in covered_indexes else 0 for i in range(len(input_tokens))]
+        return mask_labels
+
+    def torch_mask_tokens(self, inputs: Any, mask_labels: Any) -> Tuple[Any, Any]:
+        """
+        Prepare masked tokens inputs/labels for masked language modeling: 80% MASK, 10% random, 10% original. Set
+        'mask_labels' means we use whole word mask (wwm), we directly mask idxs according to it's ref.
+        """
+        import torch
+
+        if self.tokenizer.mask_token is None:
+            raise ValueError(
+                "This tokenizer does not have a mask token which is necessary for masked language modeling. Remove the"
+                " --mlm flag if you want to use this tokenizer."
+            )
+        labels = inputs.clone()
+        # We sample a few tokens in each sequence for masked-LM training (with probability args.mlm_probability defaults to 0.15 in Bert/RoBERTa)
+
+        probability_matrix = mask_labels
+
+        special_tokens_mask = [
+            self.tokenizer.get_special_tokens_mask(val, already_has_special_tokens=True) for val in labels.tolist()
+        ]
+        probability_matrix.masked_fill_(torch.tensor(special_tokens_mask, dtype=torch.bool), value=0.0)
+        if self.tokenizer.pad_token is not None:
+            padding_mask = labels.eq(self.tokenizer.pad_token_id)
+            probability_matrix.masked_fill_(padding_mask, value=0.0)
+
+        masked_indices = probability_matrix.bool()
+        labels[~masked_indices] = -100  # We only compute loss on masked tokens
+
+        # 80% of the time, we replace masked input tokens with tokenizer.mask_token ([MASK])
+        indices_replaced = torch.bernoulli(torch.full(labels.shape, 0.8)).bool() & masked_indices
+        inputs[indices_replaced] = self.tokenizer.convert_tokens_to_ids(self.tokenizer.mask_token)
+
+        # 10% of the time, we replace masked input tokens with random word
+        indices_random = torch.bernoulli(torch.full(labels.shape, 0.5)).bool() & masked_indices & ~indices_replaced
+        random_words = torch.randint(len(self.tokenizer), labels.shape, dtype=torch.long)
+        inputs[indices_random] = random_words[indices_random]
+
+        # The rest of the time (10% of the time) we keep the masked input tokens unchanged
+        return inputs, labels
+
+    def tf_mask_tokens(self, inputs: Any, mask_labels: Any) -> Tuple[Any, Any]:
+        """
+        Prepare masked tokens inputs/labels for masked language modeling: 80% MASK, 10% random, 10% original. Set
+        'mask_labels' means we use whole word mask (wwm), we directly mask idxs according to it's ref.
+        """
+        import tensorflow as tf
+
+        input_shape = tf.shape(inputs)
+        if self.tokenizer.mask_token is None:
+            raise ValueError(
+                "This tokenizer does not have a mask token which is necessary for masked language modeling. Remove the"
+                " --mlm flag if you want to use this tokenizer."
+            )
+        labels = tf.identity(inputs)
+        # We sample a few tokens in each sequence for masked-LM training (with probability args.mlm_probability defaults to 0.15 in Bert/RoBERTa)
+
+        masked_indices = tf.cast(mask_labels, tf.bool)
+
+        special_tokens_mask = [
+            self.tokenizer.get_special_tokens_mask(val, already_has_special_tokens=True) for val in labels
+        ]
+        masked_indices = masked_indices & ~tf.cast(special_tokens_mask, dtype=tf.bool)
+        if self.tokenizer.pad_token is not None:
+            padding_mask = inputs == self.tokenizer.pad_token_id
+            masked_indices = masked_indices & ~padding_mask
+
+        # Replace unmasked indices with -100 in the labels since we only compute loss on masked tokens
+        labels = tf.where(masked_indices, inputs, -100)
+
+        # 80% of the time, we replace masked input tokens with tokenizer.mask_token ([MASK])
+        indices_replaced = self.tf_bernoulli(input_shape, 0.8) & masked_indices
+
+        inputs = tf.where(indices_replaced, self.tokenizer.mask_token_id, inputs)
+
+        # 10% of the time, we replace masked input tokens with random word
+        indices_random = self.tf_bernoulli(input_shape, 0.5) & masked_indices & ~indices_replaced
+        random_words = tf.random.uniform(input_shape, maxval=len(self.tokenizer), dtype=tf.int64)
+        inputs = tf.where(indices_random, random_words, inputs)
+
+        # The rest of the time (10% of the time) we keep the masked input tokens unchanged
+        return inputs, labels
+
+    def numpy_mask_tokens(self, inputs: Any, mask_labels: Any) -> Tuple[Any, Any]:
+        """
+        Prepare masked tokens inputs/labels for masked language modeling: 80% MASK, 10% random, 10% original. Set
+        'mask_labels' means we use whole word mask (wwm), we directly mask idxs according to it's ref.
+        """
+        if self.tokenizer.mask_token is None:
+            raise ValueError(
+                "This tokenizer does not have a mask token which is necessary for masked language modeling. Remove the"
+                " --mlm flag if you want to use this tokenizer."
+            )
+        labels = np.copy(inputs)
+        # We sample a few tokens in each sequence for masked-LM training (with probability args.mlm_probability defaults to 0.15 in Bert/RoBERTa)
+
+        masked_indices = mask_labels.astype(bool)
+
+        special_tokens_mask = [
+            self.tokenizer.get_special_tokens_mask(val, already_has_special_tokens=True) for val in labels.tolist()
+        ]
+        masked_indices[np.array(special_tokens_mask, dtype=bool)] = 0
+        if self.tokenizer.pad_token is not None:
+            padding_mask = labels == self.tokenizer.pad_token_id
+            masked_indices[padding_mask] = 0
+
+        labels[~masked_indices] = -100  # We only compute loss on masked tokens
+
+        # 80% of the time, we replace masked input tokens with tokenizer.mask_token ([MASK])
+        indices_replaced = np.random.binomial(1, 0.8, size=labels.shape).astype(bool) & masked_indices
+        inputs[indices_replaced] = self.tokenizer.convert_tokens_to_ids(self.tokenizer.mask_token)
+
+        # 10% of the time, we replace masked input tokens with random word
+        # indices_random = torch.bernoulli(torch.full(labels.shape, 0.5)).bool() & masked_indices & ~indices_replaced
+        indices_random = (
+            np.random.binomial(1, 0.5, size=labels.shape).astype(bool) & masked_indices & ~indices_replaced
+        )
+        random_words = np.random.randint(low=0, high=len(self.tokenizer), size=labels.shape, dtype=np.int64)
+        inputs[indices_random] = random_words[indices_random]
+
+        # The rest of the time (10% of the time) we keep the masked input tokens unchanged
+        return inputs, labels
+
+
+@dataclass
+class DataCollatorForSOP(DataCollatorForLanguageModeling):
+    """
+    Data collator used for sentence order prediction task.
+
+    - collates batches of tensors, honoring their tokenizer's pad_token
+    - preprocesses batches for both masked language modeling and sentence order prediction
+    """
+
+    def __init__(self, *args, **kwargs):
+        warnings.warn(
+            "DataCollatorForSOP is deprecated and will be removed in a future version, you can now use "
+            "DataCollatorForLanguageModeling instead.",
+            FutureWarning,
+        )
+
+    def __call__(self, examples: List[Dict[str, Any]]) -> Dict[str, Any]:
+        import torch
+        from torch.nn.utils.rnn import pad_sequence
+
+        input_ids = [example["input_ids"] for example in examples]
+        input_ids = _torch_collate_batch(input_ids, self.tokenizer)
+        input_ids, labels, attention_mask = self.mask_tokens(input_ids)
+
+        token_type_ids = [example["token_type_ids"] for example in examples]
+        # size of segment_ids varied because randomness, padding zero to the end as the original implementation
+        token_type_ids = pad_sequence(token_type_ids, batch_first=True, padding_value=self.tokenizer.pad_token_id)
+
+        sop_label_list = [example["sentence_order_label"] for example in examples]
+        sentence_order_label = torch.stack(sop_label_list)
+
+        return {
+            "input_ids": input_ids,
+            "labels": labels,
+            "attention_mask": attention_mask,
+            "token_type_ids": token_type_ids,
+            "sentence_order_label": sentence_order_label,
+        }
+
+    def mask_tokens(self, inputs: Any) -> Tuple[Any, Any, Any]:
+        """
+        Prepare masked tokens inputs/labels/attention_mask for masked language modeling: 80% MASK, 10% random, 10%
+        original. N-gram not applied yet.
+        """
+        import torch
+
+        if self.tokenizer.mask_token is None:
+            raise ValueError(
+                "This tokenizer does not have a mask token which is necessary for masked language modeling. Remove the"
+                " --mlm flag if you want to use this tokenizer."
+            )
+
+        labels = inputs.clone()
+        # We sample a few tokens in each sequence for masked-LM training (with probability args.mlm_probability defaults to 0.15 in Bert/RoBERTa)
+        probability_matrix = torch.full(labels.shape, self.mlm_probability)
+        special_tokens_mask = [
+            self.tokenizer.get_special_tokens_mask(val, already_has_special_tokens=True) for val in labels.tolist()
+        ]
+        probability_matrix.masked_fill_(torch.tensor(special_tokens_mask, dtype=torch.bool), value=0.0)
+        if self.tokenizer.pad_token is not None:
+            padding_mask = labels.eq(self.tokenizer.pad_token_id)
+            probability_matrix.masked_fill_(padding_mask, value=0.0)
+        masked_indices = torch.bernoulli(probability_matrix).bool()
+        # probability be `1` (masked), however in albert model attention mask `0` means masked, revert the value
+        attention_mask = (~masked_indices).float()
+        if self.tokenizer.pad_token is not None:
+            attention_padding_mask = labels.eq(self.tokenizer.pad_token_id)
+            attention_mask.masked_fill_(attention_padding_mask, value=1.0)
+        labels[~masked_indices] = -100  # We only compute loss on masked tokens, -100 is default for CE compute
+
+        # 80% of the time, we replace masked input tokens with tokenizer.mask_token ([MASK])
+        indices_replaced = torch.bernoulli(torch.full(labels.shape, 0.8)).bool() & masked_indices
+        inputs[indices_replaced] = self.tokenizer.convert_tokens_to_ids(self.tokenizer.mask_token)
+
+        # 10% of the time, we replace masked input tokens with random word
+        indices_random = torch.bernoulli(torch.full(labels.shape, 0.5)).bool() & masked_indices & ~indices_replaced
+        random_words = torch.randint(len(self.tokenizer), labels.shape, dtype=torch.long)
+        inputs[indices_random] = random_words[indices_random]
+
+        # The rest of the time (10% of the time) we keep the masked input tokens unchanged
+        return inputs, labels, attention_mask
+
+
+@dataclass
+class DataCollatorForPermutationLanguageModeling(DataCollatorMixin):
+    """
+    Data collator used for permutation language modeling.
+
+    - collates batches of tensors, honoring their tokenizer's pad_token
+    - preprocesses batches for permutation language modeling with procedures specific to XLNet
+    """
+
+    tokenizer: PreTrainedTokenizerBase
+    plm_probability: float = 1 / 6
+    max_span_length: int = 5  # maximum length of a span of masked tokens
+    return_tensors: str = "pt"
+
+    def torch_call(self, examples: List[Union[List[int], Any, Dict[str, Any]]]) -> Dict[str, Any]:
+        if isinstance(examples[0], Mapping):
+            examples = [e["input_ids"] for e in examples]
+        batch = _torch_collate_batch(examples, self.tokenizer)
+        inputs, perm_mask, target_mapping, labels = self.torch_mask_tokens(batch)
+        return {"input_ids": inputs, "perm_mask": perm_mask, "target_mapping": target_mapping, "labels": labels}
+
+    def tf_call(self, examples: List[Union[List[int], Any, Dict[str, Any]]]) -> Dict[str, Any]:
+        if isinstance(examples[0], Mapping):
+            examples = [e["input_ids"] for e in examples]
+        batch = _tf_collate_batch(examples, self.tokenizer)
+        inputs, perm_mask, target_mapping, labels = self.tf_mask_tokens(batch)
+        return {"input_ids": inputs, "perm_mask": perm_mask, "target_mapping": target_mapping, "labels": labels}
+
+    def numpy_call(self, examples: List[Union[List[int], Any, Dict[str, Any]]]) -> Dict[str, Any]:
+        if isinstance(examples[0], Mapping):
+            examples = [e["input_ids"] for e in examples]
+        batch = _numpy_collate_batch(examples, self.tokenizer)
+        inputs, perm_mask, target_mapping, labels = self.numpy_mask_tokens(batch)
+        return {"input_ids": inputs, "perm_mask": perm_mask, "target_mapping": target_mapping, "labels": labels}
+
+    def torch_mask_tokens(self, inputs: Any) -> Tuple[Any, Any, Any, Any]:
+        """
+        The masked tokens to be predicted for a particular sequence are determined by the following algorithm:
+
+            0. Start from the beginning of the sequence by setting `cur_len = 0` (number of tokens processed so far).
+            1. Sample a `span_length` from the interval `[1, max_span_length]` (length of span of tokens to be masked)
+            2. Reserve a context of length `context_length = span_length / plm_probability` to surround span to be
+               masked
+            3. Sample a starting point `start_index` from the interval `[cur_len, cur_len + context_length -
+               span_length]` and mask tokens `start_index:start_index + span_length`
+            4. Set `cur_len = cur_len + context_length`. If `cur_len < max_len` (i.e. there are tokens remaining in the
+               sequence to be processed), repeat from Step 1.
+        """
+        import torch
+
+        if self.tokenizer.mask_token is None:
+            raise ValueError(
+                "This tokenizer does not have a mask token which is necessary for permutation language modeling."
+                " Please add a mask token if you want to use this tokenizer."
+            )
+
+        if inputs.size(1) % 2 != 0:
+            raise ValueError(
+                "This collator requires that sequence lengths be even to create a leakage-free perm_mask. Please see"
+                " relevant comments in source code for details."
+            )
+
+        labels = inputs.clone()
+        # Creating the mask and target_mapping tensors
+        masked_indices = torch.full(labels.shape, 0, dtype=torch.bool)
+        target_mapping = torch.zeros((labels.size(0), labels.size(1), labels.size(1)), dtype=torch.float32)
+
+        for i in range(labels.size(0)):
+            # Start from the beginning of the sequence by setting `cur_len = 0` (number of tokens processed so far).
+            cur_len = 0
+            max_len = labels.size(1)
+
+            while cur_len < max_len:
+                # Sample a `span_length` from the interval `[1, max_span_length]` (length of span of tokens to be masked)
+                span_length = torch.randint(1, self.max_span_length + 1, (1,)).item()
+                # Reserve a context of length `context_length = span_length / plm_probability` to surround the span to be masked
+                context_length = int(span_length / self.plm_probability)
+                # Sample a starting point `start_index` from the interval `[cur_len, cur_len + context_length - span_length]` and mask tokens `start_index:start_index + span_length`
+                start_index = cur_len + torch.randint(context_length - span_length + 1, (1,)).item()
+                masked_indices[i, start_index : start_index + span_length] = 1
+                # Set `cur_len = cur_len + context_length`
+                cur_len += context_length
+
+            # Since we're replacing non-masked tokens with -100 in the labels tensor instead of skipping them altogether,
+            # the i-th predict corresponds to the i-th token.
+            target_mapping[i] = torch.eye(labels.size(1))
+
+        special_tokens_mask = torch.tensor(
+            [self.tokenizer.get_special_tokens_mask(val, already_has_special_tokens=True) for val in labels.tolist()],
+            dtype=torch.bool,
+        )
+        masked_indices.masked_fill_(special_tokens_mask, value=0.0)
+        if self.tokenizer.pad_token is not None:
+            padding_mask = labels.eq(self.tokenizer.pad_token_id)
+            masked_indices.masked_fill_(padding_mask, value=0.0)
+
+        # Mask indicating non-functional tokens, where functional tokens are [SEP], [CLS], padding, etc.
+        non_func_mask = ~(padding_mask | special_tokens_mask)
+
+        inputs[masked_indices] = self.tokenizer.mask_token_id
+        labels[~masked_indices] = -100  # We only compute loss on masked tokens
+
+        perm_mask = torch.zeros((labels.size(0), labels.size(1), labels.size(1)), dtype=torch.float32)
+
+        for i in range(labels.size(0)):
+            # Generate permutation indices i.e. sample a random factorisation order for the sequence. This will
+            # determine which tokens a given token can attend to (encoded in `perm_mask`).
+            # Note: Length of token sequence being permuted has to be less than or equal to reused sequence length
+            # (see documentation for `mems`), otherwise information may leak through due to reuse. In this implementation,
+            # we assume that reused length is half of sequence length and permutation length is equal to reused length.
+            # This requires that the sequence length be even.
+
+            # Create a linear factorisation order
+            perm_index = torch.arange(labels.size(1))
+            # Split this into two halves, assuming that half the sequence is reused each time
+            perm_index = perm_index.reshape((-1, labels.size(1) // 2)).transpose(0, 1)
+            # Permute the two halves such that they do not cross over
+            perm_index = perm_index[torch.randperm(labels.size(1) // 2)]
+            # Flatten this out into the desired permuted factorisation order
+            perm_index = torch.flatten(perm_index.transpose(0, 1))
+            # Set the permutation indices of non-masked (non-functional) tokens to the
+            # smallest index (-1) so that:
+            # (1) They can be seen by all other positions
+            # (2) They cannot see masked positions, so there won't be information leak
+            perm_index.masked_fill_(~masked_indices[i] & non_func_mask[i], -1)
+            # The logic for whether the i-th token can attend on the j-th token based on the factorisation order:
+            # 0 (can attend): If perm_index[i] > perm_index[j] or j is neither masked nor a functional token
+            # 1 (cannot attend): If perm_index[i] <= perm_index[j] and j is either masked or a functional token
+            perm_mask[i] = (
+                perm_index.reshape((labels.size(1), 1)) <= perm_index.reshape((1, labels.size(1)))
+            ) & masked_indices[i]
+
+        return inputs.long(), perm_mask, target_mapping, labels.long()
+
+    def tf_mask_tokens(self, inputs: Any) -> Tuple[Any, Any, Any, Any]:
+        """
+        The masked tokens to be predicted for a particular sequence are determined by the following algorithm:
+
+            0. Start from the beginning of the sequence by setting `cur_len = 0` (number of tokens processed so far).
+            1. Sample a `span_length` from the interval `[1, max_span_length]` (length of span of tokens to be masked)
+            2. Reserve a context of length `context_length = span_length / plm_probability` to surround span to be
+               masked
+            3. Sample a starting point `start_index` from the interval `[cur_len, cur_len + context_length -
+               span_length]` and mask tokens `start_index:start_index + span_length`
+            4. Set `cur_len = cur_len + context_length`. If `cur_len < max_len` (i.e. there are tokens remaining in the
+               sequence to be processed), repeat from Step 1.
+        """
+        import tensorflow as tf
+
+        if self.tokenizer.mask_token is None:
+            raise ValueError(
+                "This tokenizer does not have a mask token which is necessary for permutation language modeling."
+                " Please add a mask token if you want to use this tokenizer."
+            )
+
+        if tf.shape(inputs)[1] % 2 != 0:
+            raise ValueError(
+                "This collator requires that sequence lengths be even to create a leakage-free perm_mask. Please see"
+                " relevant comments in source code for details."
+            )
+
+        labels = tf.identity(inputs)
+        # Creating the mask and target_mapping tensors
+        masked_indices = np.full(labels.shape.as_list(), 0, dtype=bool)
+        labels_shape = tf.shape(labels)
+        target_mapping = np.zeros((labels_shape[0], labels_shape[1], labels_shape[1]), dtype=np.float32)
+
+        for i in range(len(labels)):
+            # Start from the beginning of the sequence by setting `cur_len = 0` (number of tokens processed so far).
+            cur_len = 0
+            max_len = tf.shape(labels)[1]
+
+            while cur_len < max_len:
+                # Sample a `span_length` from the interval `[1, max_span_length]` (length of span of tokens to be masked)
+                span_length = randint(1, self.max_span_length + 1)
+                # Reserve a context of length `context_length = span_length / plm_probability` to surround the span to be masked
+                context_length = int(span_length / self.plm_probability)
+                # Sample a starting point `start_index` from the interval `[cur_len, cur_len + context_length - span_length]` and mask tokens `start_index:start_index + span_length`
+                start_index = cur_len + randint(0, context_length - span_length + 1)
+                masked_indices[i, start_index : start_index + span_length] = 1
+                # Set `cur_len = cur_len + context_length`
+                cur_len += context_length
+
+            # Since we're replacing non-masked tokens with -100 in the labels tensor instead of skipping them altogether,
+            # the i-th predict corresponds to the i-th token.
+            target_mapping[i] = np.eye(labels_shape[1])
+        masked_indices = tf.cast(tf.convert_to_tensor(masked_indices), dtype=tf.bool)
+        target_mapping = tf.convert_to_tensor(target_mapping)
+        special_tokens_mask = tf.convert_to_tensor(
+            [
+                self.tokenizer.get_special_tokens_mask(val, already_has_special_tokens=True)
+                for val in labels.numpy().tolist()
+            ],
+        )
+        special_tokens_mask = tf.cast(special_tokens_mask, dtype=tf.bool)
+        masked_indices = masked_indices & ~special_tokens_mask
+        if self.tokenizer.pad_token is not None:
+            padding_mask = labels == self.tokenizer.pad_token_id
+            masked_indices = masked_indices & ~padding_mask
+
+        # Mask indicating non-functional tokens, where functional tokens are [SEP], [CLS], padding, etc.
+        non_func_mask = ~(padding_mask | special_tokens_mask)
+
+        inputs = tf.where(masked_indices, self.tokenizer.mask_token_id, inputs)
+        labels = tf.where(masked_indices, labels, -100)  # We only compute loss on masked tokens
+
+        perm_mask = []
+
+        for i in range(len(labels)):
+            # Generate permutation indices i.e. sample a random factorisation order for the sequence. This will
+            # determine which tokens a given token can attend to (encoded in `perm_mask`).
+            # Note: Length of token sequence being permuted has to be less than or equal to reused sequence length
+            # (see documentation for `mems`), otherwise information may leak through due to reuse. In this implementation,
+            # we assume that reused length is half of sequence length and permutation length is equal to reused length.
+            # This requires that the sequence length be even.
+
+            # Create a linear factorisation order
+            # tf.range is the equivalent of torch.arange
+            perm_index = tf.range(labels_shape[1])
+            # Split this into two halves, assuming that half the sequence is reused each time
+            perm_index = tf.transpose(tf.reshape(perm_index, (-1, labels_shape[1] // 2)))
+            # Permute the two halves such that they do not cross over
+            perm_index = tf.random.shuffle(perm_index)  # Shuffles along the first dimension
+            # Flatten this out into the desired permuted factorisation order
+            perm_index = tf.reshape(tf.transpose(perm_index), (-1,))
+            # Set the permutation indices of non-masked (non-functional) tokens to the
+            # smallest index (-1) so that:
+            # (1) They can be seen by all other positions
+            # (2) They cannot see masked positions, so there won't be information leak
+            perm_index = tf.where(~masked_indices[i] & non_func_mask[i], -1, perm_index)
+            # The logic for whether the i-th token can attend on the j-th token based on the factorisation order:
+            # 0 (can attend): If perm_index[i] > perm_index[j] or j is neither masked nor a functional token
+            # 1 (cannot attend): If perm_index[i] <= perm_index[j] and j is either masked or a functional token
+            perm_mask.append(
+                (tf.reshape(perm_index, (labels_shape[1], 1)) <= tf.reshape(perm_index, (1, labels_shape[1])))
+                & masked_indices[i]
+            )
+        perm_mask = tf.stack(perm_mask, axis=0)
+
+        return tf.cast(inputs, tf.int64), tf.cast(perm_mask, tf.float32), target_mapping, tf.cast(labels, tf.int64)
+
+    def numpy_mask_tokens(self, inputs: Any) -> Tuple[Any, Any, Any, Any]:
+        """
+        The masked tokens to be predicted for a particular sequence are determined by the following algorithm:
+
+            0. Start from the beginning of the sequence by setting `cur_len = 0` (number of tokens processed so far).
+            1. Sample a `span_length` from the interval `[1, max_span_length]` (length of span of tokens to be masked)
+            2. Reserve a context of length `context_length = span_length / plm_probability` to surround span to be
+               masked
+            3. Sample a starting point `start_index` from the interval `[cur_len, cur_len + context_length -
+               span_length]` and mask tokens `start_index:start_index + span_length`
+            4. Set `cur_len = cur_len + context_length`. If `cur_len < max_len` (i.e. there are tokens remaining in the
+               sequence to be processed), repeat from Step 1.
+        """
+        if self.tokenizer.mask_token is None:
+            raise ValueError(
+                "This tokenizer does not have a mask token which is necessary for permutation language modeling."
+                " Please add a mask token if you want to use this tokenizer."
+            )
+
+        if inputs.shape[1] % 2 != 0:
+            raise ValueError(
+                "This collator requires that sequence lengths be even to create a leakage-free perm_mask. Please see"
+                " relevant comments in source code for details."
+            )
+
+        labels = np.copy(inputs)
+        # Creating the mask and target_mapping tensors
+        masked_indices = np.full(labels.shape, 0, dtype=bool)
+        target_mapping = np.zeros((labels.shape[0], labels.shape[1], labels.shape[1]), dtype=np.float32)
+
+        for i in range(labels.shape[0]):
+            # Start from the beginning of the sequence by setting `cur_len = 0` (number of tokens processed so far).
+            cur_len = 0
+            max_len = labels.shape[1]
+
+            while cur_len < max_len:
+                # Sample a `span_length` from the interval `[1, max_span_length]` (length of span of tokens to be masked)
+                span_length = randint(1, self.max_span_length + 1)
+                # Reserve a context of length `context_length = span_length / plm_probability` to surround the span to be masked
+                context_length = int(span_length / self.plm_probability)
+                # Sample a starting point `start_index` from the interval `[cur_len, cur_len + context_length - span_length]` and mask tokens `start_index:start_index + span_length`
+                start_index = cur_len + randint(0, context_length - span_length + 1)
+                masked_indices[i, start_index : start_index + span_length] = 1
+                # Set `cur_len = cur_len + context_length`
+                cur_len += context_length
+
+            # Since we're replacing non-masked tokens with -100 in the labels tensor instead of skipping them altogether,
+            # the i-th predict corresponds to the i-th token.
+            target_mapping[i] = np.eye(labels.shape[1])
+
+        special_tokens_mask = np.array(
+            [self.tokenizer.get_special_tokens_mask(val, already_has_special_tokens=True) for val in labels.tolist()],
+            dtype=bool,
+        )
+        masked_indices[special_tokens_mask] = 0
+        if self.tokenizer.pad_token is not None:
+            padding_mask = labels == self.tokenizer.pad_token_id
+            masked_indices[padding_mask] = 0.0
+
+        # Mask indicating non-functional tokens, where functional tokens are [SEP], [CLS], padding, etc.
+        non_func_mask = ~(padding_mask | special_tokens_mask)
+
+        inputs[masked_indices] = self.tokenizer.mask_token_id
+        labels[~masked_indices] = -100  # We only compute loss on masked tokens
+
+        perm_mask = np.zeros((labels.shape[0], labels.shape[1], labels.shape[1]), dtype=np.float32)
+
+        for i in range(labels.shape[0]):
+            # Generate permutation indices i.e. sample a random factorisation order for the sequence. This will
+            # determine which tokens a given token can attend to (encoded in `perm_mask`).
+            # Note: Length of token sequence being permuted has to be less than or equal to reused sequence length
+            # (see documentation for `mems`), otherwise information may leak through due to reuse. In this implementation,
+            # we assume that reused length is half of sequence length and permutation length is equal to reused length.
+            # This requires that the sequence length be even.
+
+            # Create a linear factorisation order
+            perm_index = np.arange(labels.shape[1])
+            # Split this into two halves, assuming that half the sequence is reused each time
+            perm_index = perm_index.reshape((-1, labels.shape[1] // 2)).T
+            # Permute the two halves such that they do not cross over
+            np.random.shuffle(perm_index)
+            # Flatten this out into the desired permuted factorisation order
+            perm_index = perm_index.T.flatten()
+            # Set the permutation indices of non-masked (non-functional) tokens to the
+            # smallest index (-1) so that:
+            # (1) They can be seen by all other positions
+            # (2) They cannot see masked positions, so there won't be information leak
+            perm_index[~masked_indices[i] & non_func_mask[i]] = -1
+            # The logic for whether the i-th token can attend on the j-th token based on the factorisation order:
+            # 0 (can attend): If perm_index[i] > perm_index[j] or j is neither masked nor a functional token
+            # 1 (cannot attend): If perm_index[i] <= perm_index[j] and j is either masked or a functional token
+            perm_mask[i] = (
+                perm_index.reshape((labels.shape[1], 1)) <= perm_index.reshape((1, labels.shape[1]))
+            ) & masked_indices[i]
+
+        return inputs.astype(np.int64), perm_mask, target_mapping, labels.astype(np.int64)
+
+
+@dataclass
+class DataCollatorWithFlattening(DefaultDataCollator):
+    """
+    Data collator used for padding free approach. Does the following:
+
+    - concatate the entire mini batch into single long sequence [1, total_tokens]
+    - uses `separator_id` to separate sequences within the concatenated `labels`, default value is -100
+    - no padding will be added, returns `input_ids`, `labels` and `position_ids`
+    """
+
+    def __init__(self, *args, return_position_ids=True, separator_id=-100, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.return_position_ids = return_position_ids
+        self.separator_id = separator_id
+        warnings.warn(
+            "Using `DataCollatorWithFlattening` will flatten the entire mini batch into single long sequence."
+            "Make sure your attention computation is able to handle it!"
+        )
+
+    def __call__(self, features, return_tensors=None, separator_id=None):
+        if return_tensors is None:
+            return_tensors = self.return_tensors
+        if separator_id is None:
+            separator_id = self.separator_id
+        is_labels_provided = "labels" in features[0]
+        ret = {"input_ids": [], "labels": []}
+        if self.return_position_ids:
+            ret.update({"position_ids": []})
+        for idx in range(0, len(features)):
+            ret["input_ids"] += features[idx]["input_ids"]
+            if is_labels_provided:
+                ret["labels"] += [separator_id] + features[idx]["labels"][1:]
+            else:
+                ret["labels"] += [separator_id] + features[idx]["input_ids"][1:]
+            if self.return_position_ids:
+                ret["position_ids"] += list(range(len(features[idx]["input_ids"])))
+        return default_data_collator([ret], return_tensors)
diff --git a/.venv/lib/python3.11/site-packages/transformers/data/datasets/__init__.py b/.venv/lib/python3.11/site-packages/transformers/data/datasets/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..378894ab4bbb4704b67b1de4ab512f145b889d46
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/transformers/data/datasets/__init__.py
@@ -0,0 +1,23 @@
+# Copyright 2020 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .glue import GlueDataset, GlueDataTrainingArguments
+from .language_modeling import (
+    LineByLineTextDataset,
+    LineByLineWithRefDataset,
+    LineByLineWithSOPTextDataset,
+    TextDataset,
+    TextDatasetForNextSentencePrediction,
+)
+from .squad import SquadDataset, SquadDataTrainingArguments
diff --git a/.venv/lib/python3.11/site-packages/transformers/data/datasets/__pycache__/__init__.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/data/datasets/__pycache__/__init__.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..31e2d4f2cbdc1d8c8bee4a9f5b9d22b439d7451a
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/data/datasets/__pycache__/__init__.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/transformers/data/datasets/__pycache__/glue.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/data/datasets/__pycache__/glue.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..840cefbd2e0647f405f253bbec6b2e8f00419254
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/data/datasets/__pycache__/glue.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/transformers/data/datasets/__pycache__/language_modeling.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/data/datasets/__pycache__/language_modeling.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..0dc1791c237718afadc67000232eba7c1260378d
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/data/datasets/__pycache__/language_modeling.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/transformers/data/datasets/__pycache__/squad.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/data/datasets/__pycache__/squad.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..3e06ee3c78af972c4a44ceb812d8268ef7098250
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/data/datasets/__pycache__/squad.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/transformers/data/datasets/glue.py b/.venv/lib/python3.11/site-packages/transformers/data/datasets/glue.py
new file mode 100644
index 0000000000000000000000000000000000000000..72df3bece21925d15748d53bd82def67bfdd82bb
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/transformers/data/datasets/glue.py
@@ -0,0 +1,161 @@
+# Copyright 2020 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import time
+import warnings
+from dataclasses import dataclass, field
+from enum import Enum
+from typing import List, Optional, Union
+
+import torch
+from filelock import FileLock
+from torch.utils.data import Dataset
+
+from ...tokenization_utils_base import PreTrainedTokenizerBase
+from ...utils import logging
+from ..processors.glue import glue_convert_examples_to_features, glue_output_modes, glue_processors
+from ..processors.utils import InputFeatures
+
+
+logger = logging.get_logger(__name__)
+
+
+@dataclass
+class GlueDataTrainingArguments:
+    """
+    Arguments pertaining to what data we are going to input our model for training and eval.
+
+    Using `HfArgumentParser` we can turn this class into argparse arguments to be able to specify them on the command
+    line.
+    """
+
+    task_name: str = field(metadata={"help": "The name of the task to train on: " + ", ".join(glue_processors.keys())})
+    data_dir: str = field(
+        metadata={"help": "The input data dir. Should contain the .tsv files (or other data files) for the task."}
+    )
+    max_seq_length: int = field(
+        default=128,
+        metadata={
+            "help": (
+                "The maximum total input sequence length after tokenization. Sequences longer "
+                "than this will be truncated, sequences shorter will be padded."
+            )
+        },
+    )
+    overwrite_cache: bool = field(
+        default=False, metadata={"help": "Overwrite the cached training and evaluation sets"}
+    )
+
+    def __post_init__(self):
+        self.task_name = self.task_name.lower()
+
+
+class Split(Enum):
+    train = "train"
+    dev = "dev"
+    test = "test"
+
+
+class GlueDataset(Dataset):
+    """
+    This will be superseded by a framework-agnostic approach soon.
+    """
+
+    args: GlueDataTrainingArguments
+    output_mode: str
+    features: List[InputFeatures]
+
+    def __init__(
+        self,
+        args: GlueDataTrainingArguments,
+        tokenizer: PreTrainedTokenizerBase,
+        limit_length: Optional[int] = None,
+        mode: Union[str, Split] = Split.train,
+        cache_dir: Optional[str] = None,
+    ):
+        warnings.warn(
+            "This dataset will be removed from the library soon, preprocessing should be handled with the 🤗 Datasets "
+            "library. You can have a look at this example script for pointers: "
+            "https://github.com/huggingface/transformers/blob/main/examples/pytorch/text-classification/run_glue.py",
+            FutureWarning,
+        )
+        self.args = args
+        self.processor = glue_processors[args.task_name]()
+        self.output_mode = glue_output_modes[args.task_name]
+        if isinstance(mode, str):
+            try:
+                mode = Split[mode]
+            except KeyError:
+                raise KeyError("mode is not a valid split name")
+        # Load data features from cache or dataset file
+        cached_features_file = os.path.join(
+            cache_dir if cache_dir is not None else args.data_dir,
+            f"cached_{mode.value}_{tokenizer.__class__.__name__}_{args.max_seq_length}_{args.task_name}",
+        )
+        label_list = self.processor.get_labels()
+        if args.task_name in ["mnli", "mnli-mm"] and tokenizer.__class__.__name__ in (
+            "RobertaTokenizer",
+            "RobertaTokenizerFast",
+            "XLMRobertaTokenizer",
+            "BartTokenizer",
+            "BartTokenizerFast",
+        ):
+            # HACK(label indices are swapped in RoBERTa pretrained model)
+            label_list[1], label_list[2] = label_list[2], label_list[1]
+        self.label_list = label_list
+
+        # Make sure only the first process in distributed training processes the dataset,
+        # and the others will use the cache.
+        lock_path = cached_features_file + ".lock"
+        with FileLock(lock_path):
+            if os.path.exists(cached_features_file) and not args.overwrite_cache:
+                start = time.time()
+                self.features = torch.load(cached_features_file)
+                logger.info(
+                    f"Loading features from cached file {cached_features_file} [took %.3f s]", time.time() - start
+                )
+            else:
+                logger.info(f"Creating features from dataset file at {args.data_dir}")
+
+                if mode == Split.dev:
+                    examples = self.processor.get_dev_examples(args.data_dir)
+                elif mode == Split.test:
+                    examples = self.processor.get_test_examples(args.data_dir)
+                else:
+                    examples = self.processor.get_train_examples(args.data_dir)
+                if limit_length is not None:
+                    examples = examples[:limit_length]
+                self.features = glue_convert_examples_to_features(
+                    examples,
+                    tokenizer,
+                    max_length=args.max_seq_length,
+                    label_list=label_list,
+                    output_mode=self.output_mode,
+                )
+                start = time.time()
+                torch.save(self.features, cached_features_file)
+                # ^ This seems to take a lot of time so I want to investigate why and how we can improve.
+                logger.info(
+                    f"Saving features into cached file {cached_features_file} [took {time.time() - start:.3f} s]"
+                )
+
+    def __len__(self):
+        return len(self.features)
+
+    def __getitem__(self, i) -> InputFeatures:
+        return self.features[i]
+
+    def get_labels(self):
+        return self.label_list
diff --git a/.venv/lib/python3.11/site-packages/transformers/data/datasets/language_modeling.py b/.venv/lib/python3.11/site-packages/transformers/data/datasets/language_modeling.py
new file mode 100644
index 0000000000000000000000000000000000000000..6c23bf23cf14d4953a278dd3584093d0af084133
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/transformers/data/datasets/language_modeling.py
@@ -0,0 +1,530 @@
+# Copyright 2020 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import json
+import os
+import pickle
+import random
+import time
+import warnings
+from typing import Dict, List, Optional
+
+import torch
+from filelock import FileLock
+from torch.utils.data import Dataset
+
+from ...tokenization_utils import PreTrainedTokenizer
+from ...utils import logging
+
+
+logger = logging.get_logger(__name__)
+
+
+DEPRECATION_WARNING = (
+    "This dataset will be removed from the library soon, preprocessing should be handled with the 🤗 Datasets "
+    "library. You can have a look at this example script for pointers: {0}"
+)
+
+
+class TextDataset(Dataset):
+    """
+    This will be superseded by a framework-agnostic approach soon.
+    """
+
+    def __init__(
+        self,
+        tokenizer: PreTrainedTokenizer,
+        file_path: str,
+        block_size: int,
+        overwrite_cache=False,
+        cache_dir: Optional[str] = None,
+    ):
+        warnings.warn(
+            DEPRECATION_WARNING.format(
+                "https://github.com/huggingface/transformers/blob/main/examples/pytorch/language-modeling/run_mlm.py"
+            ),
+            FutureWarning,
+        )
+        if os.path.isfile(file_path) is False:
+            raise ValueError(f"Input file path {file_path} not found")
+
+        block_size = block_size - tokenizer.num_special_tokens_to_add(pair=False)
+
+        directory, filename = os.path.split(file_path)
+        cached_features_file = os.path.join(
+            cache_dir if cache_dir is not None else directory,
+            f"cached_lm_{tokenizer.__class__.__name__}_{block_size}_{filename}",
+        )
+
+        # Make sure only the first process in distributed training processes the dataset,
+        # and the others will use the cache.
+        lock_path = cached_features_file + ".lock"
+        with FileLock(lock_path):
+            if os.path.exists(cached_features_file) and not overwrite_cache:
+                start = time.time()
+                with open(cached_features_file, "rb") as handle:
+                    self.examples = pickle.load(handle)
+                logger.info(
+                    f"Loading features from cached file {cached_features_file} [took %.3f s]", time.time() - start
+                )
+
+            else:
+                logger.info(f"Creating features from dataset file at {directory}")
+
+                self.examples = []
+                with open(file_path, encoding="utf-8") as f:
+                    text = f.read()
+
+                tokenized_text = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(text))
+
+                for i in range(0, len(tokenized_text) - block_size + 1, block_size):  # Truncate in block of block_size
+                    self.examples.append(
+                        tokenizer.build_inputs_with_special_tokens(tokenized_text[i : i + block_size])
+                    )
+                # Note that we are losing the last truncated example here for the sake of simplicity (no padding)
+                # If your dataset is small, first you should look for a bigger one :-) and second you
+                # can change this behavior by adding (model specific) padding.
+
+                start = time.time()
+                with open(cached_features_file, "wb") as handle:
+                    pickle.dump(self.examples, handle, protocol=pickle.HIGHEST_PROTOCOL)
+                logger.info(
+                    f"Saving features into cached file {cached_features_file} [took {time.time() - start:.3f} s]"
+                )
+
+    def __len__(self):
+        return len(self.examples)
+
+    def __getitem__(self, i) -> torch.Tensor:
+        return torch.tensor(self.examples[i], dtype=torch.long)
+
+
+class LineByLineTextDataset(Dataset):
+    """
+    This will be superseded by a framework-agnostic approach soon.
+    """
+
+    def __init__(self, tokenizer: PreTrainedTokenizer, file_path: str, block_size: int):
+        warnings.warn(
+            DEPRECATION_WARNING.format(
+                "https://github.com/huggingface/transformers/blob/main/examples/pytorch/language-modeling/run_mlm.py"
+            ),
+            FutureWarning,
+        )
+        if os.path.isfile(file_path) is False:
+            raise ValueError(f"Input file path {file_path} not found")
+        # Here, we do not cache the features, operating under the assumption
+        # that we will soon use fast multithreaded tokenizers from the
+        # `tokenizers` repo everywhere =)
+        logger.info(f"Creating features from dataset file at {file_path}")
+
+        with open(file_path, encoding="utf-8") as f:
+            lines = [line for line in f.read().splitlines() if (len(line) > 0 and not line.isspace())]
+
+        batch_encoding = tokenizer(lines, add_special_tokens=True, truncation=True, max_length=block_size)
+        self.examples = batch_encoding["input_ids"]
+        self.examples = [{"input_ids": torch.tensor(e, dtype=torch.long)} for e in self.examples]
+
+    def __len__(self):
+        return len(self.examples)
+
+    def __getitem__(self, i) -> Dict[str, torch.tensor]:
+        return self.examples[i]
+
+
+class LineByLineWithRefDataset(Dataset):
+    """
+    This will be superseded by a framework-agnostic approach soon.
+    """
+
+    def __init__(self, tokenizer: PreTrainedTokenizer, file_path: str, block_size: int, ref_path: str):
+        warnings.warn(
+            DEPRECATION_WARNING.format(
+                "https://github.com/huggingface/transformers/blob/main/examples/pytorch/language-modeling/run_mlm_wwm.py"
+            ),
+            FutureWarning,
+        )
+        if os.path.isfile(file_path) is False:
+            raise ValueError(f"Input file path {file_path} not found")
+        if os.path.isfile(ref_path) is False:
+            raise ValueError(f"Ref file path {file_path} not found")
+        # Here, we do not cache the features, operating under the assumption
+        # that we will soon use fast multithreaded tokenizers from the
+        # `tokenizers` repo everywhere =)
+        logger.info(f"Creating features from dataset file at {file_path}")
+        logger.info(f"Use ref segment results at {ref_path}")
+        with open(file_path, encoding="utf-8") as f:
+            data = f.readlines()  # use this method to avoid delimiter '\u2029' to split a line
+        data = [line.strip() for line in data if len(line) > 0 and not line.isspace()]
+        # Get ref inf from file
+        with open(ref_path, encoding="utf-8") as f:
+            ref = [json.loads(line) for line in f.read().splitlines() if (len(line) > 0 and not line.isspace())]
+        if len(data) != len(ref):
+            raise ValueError(
+                f"Length of Input file should be equal to Ref file. But the length of {file_path} is {len(data)} "
+                f"while length of {ref_path} is {len(ref)}"
+            )
+
+        batch_encoding = tokenizer(data, add_special_tokens=True, truncation=True, max_length=block_size)
+        self.examples = batch_encoding["input_ids"]
+        self.examples = [{"input_ids": torch.tensor(e, dtype=torch.long)} for e in self.examples]
+
+        n = len(self.examples)
+        for i in range(n):
+            self.examples[i]["chinese_ref"] = torch.tensor(ref[i], dtype=torch.long)
+
+    def __len__(self):
+        return len(self.examples)
+
+    def __getitem__(self, i) -> Dict[str, torch.tensor]:
+        return self.examples[i]
+
+
+class LineByLineWithSOPTextDataset(Dataset):
+    """
+    Dataset for sentence order prediction task, prepare sentence pairs for SOP task
+    """
+
+    def __init__(self, tokenizer: PreTrainedTokenizer, file_dir: str, block_size: int):
+        warnings.warn(
+            DEPRECATION_WARNING.format(
+                "https://github.com/huggingface/transformers/blob/main/examples/pytorch/language-modeling/run_mlm.py"
+            ),
+            FutureWarning,
+        )
+        if os.path.isdir(file_dir) is False:
+            raise ValueError(f"{file_dir} is not a directory")
+        logger.info(f"Creating features from dataset file folder at {file_dir}")
+        self.examples = []
+        # TODO: randomness could apply a random seed, ex. rng = random.Random(random_seed)
+        # file path looks like ./dataset/wiki_1, ./dataset/wiki_2
+        for file_name in os.listdir(file_dir):
+            file_path = os.path.join(file_dir, file_name)
+            if os.path.isfile(file_path) is False:
+                raise ValueError(f"{file_path} is not a file")
+            article_open = False
+            with open(file_path, encoding="utf-8") as f:
+                original_lines = f.readlines()
+                article_lines = []
+                for line in original_lines:
+                    if "<doc id=" in line:
+                        article_open = True
+                    elif "</doc>" in line:
+                        article_open = False
+                        document = [
+                            tokenizer.convert_tokens_to_ids(tokenizer.tokenize(line))
+                            for line in article_lines[1:]
+                            if (len(line) > 0 and not line.isspace())
+                        ]
+
+                        examples = self.create_examples_from_document(document, block_size, tokenizer)
+                        self.examples.extend(examples)
+                        article_lines = []
+                    else:
+                        if article_open:
+                            article_lines.append(line)
+
+        logger.info("Dataset parse finished.")
+
+    def create_examples_from_document(self, document, block_size, tokenizer, short_seq_prob=0.1):
+        """Creates examples for a single document."""
+
+        # Account for special tokens
+        max_num_tokens = block_size - tokenizer.num_special_tokens_to_add(pair=True)
+
+        # We *usually* want to fill up the entire sequence since we are padding
+        # to `block_size` anyways, so short sequences are generally wasted
+        # computation. However, we *sometimes*
+        # (i.e., short_seq_prob == 0.1 == 10% of the time) want to use shorter
+        # sequences to minimize the mismatch between pretraining and fine-tuning.
+        # The `target_seq_length` is just a rough target however, whereas
+        # `block_size` is a hard limit.
+        target_seq_length = max_num_tokens
+        if random.random() < short_seq_prob:
+            target_seq_length = random.randint(2, max_num_tokens)
+
+        # We DON'T just concatenate all of the tokens from a document into a long
+        # sequence and choose an arbitrary split point because this would make the
+        # next sentence prediction task too easy. Instead, we split the input into
+        # segments "A" and "B" based on the actual "sentences" provided by the user
+        # input.
+        examples = []
+        current_chunk = []  # a buffer stored current working segments
+        current_length = 0
+        i = 0
+        while i < len(document):
+            segment = document[i]  # get a segment
+            if not segment:
+                i += 1
+                continue
+            current_chunk.append(segment)  # add a segment to current chunk
+            current_length += len(segment)  # overall token length
+            # if current length goes to the target length or reaches the end of file, start building token a and b
+            if i == len(document) - 1 or current_length >= target_seq_length:
+                if current_chunk:
+                    # `a_end` is how many segments from `current_chunk` go into the `A` (first) sentence.
+                    a_end = 1
+                    # if current chunk has more than 2 sentences, pick part of it `A` (first) sentence
+                    if len(current_chunk) >= 2:
+                        a_end = random.randint(1, len(current_chunk) - 1)
+                    # token a
+                    tokens_a = []
+                    for j in range(a_end):
+                        tokens_a.extend(current_chunk[j])
+
+                    # token b
+                    tokens_b = []
+                    for j in range(a_end, len(current_chunk)):
+                        tokens_b.extend(current_chunk[j])
+
+                    if len(tokens_a) == 0 or len(tokens_b) == 0:
+                        continue
+
+                    # switch tokens_a and tokens_b randomly
+                    if random.random() < 0.5:
+                        is_next = False
+                        tokens_a, tokens_b = tokens_b, tokens_a
+                    else:
+                        is_next = True
+
+                    def truncate_seq_pair(tokens_a, tokens_b, max_num_tokens):
+                        """Truncates a pair of sequences to a maximum sequence length."""
+                        while True:
+                            total_length = len(tokens_a) + len(tokens_b)
+                            if total_length <= max_num_tokens:
+                                break
+                            trunc_tokens = tokens_a if len(tokens_a) > len(tokens_b) else tokens_b
+                            if not (len(trunc_tokens) >= 1):
+                                raise ValueError("Sequence length to be truncated must be no less than one")
+                            # We want to sometimes truncate from the front and sometimes from the
+                            # back to add more randomness and avoid biases.
+                            if random.random() < 0.5:
+                                del trunc_tokens[0]
+                            else:
+                                trunc_tokens.pop()
+
+                    truncate_seq_pair(tokens_a, tokens_b, max_num_tokens)
+                    if not (len(tokens_a) >= 1):
+                        raise ValueError(f"Length of sequence a is {len(tokens_a)} which must be no less than 1")
+                    if not (len(tokens_b) >= 1):
+                        raise ValueError(f"Length of sequence b is {len(tokens_b)} which must be no less than 1")
+
+                    # add special tokens
+                    input_ids = tokenizer.build_inputs_with_special_tokens(tokens_a, tokens_b)
+                    # add token type ids, 0 for sentence a, 1 for sentence b
+                    token_type_ids = tokenizer.create_token_type_ids_from_sequences(tokens_a, tokens_b)
+
+                    example = {
+                        "input_ids": torch.tensor(input_ids, dtype=torch.long),
+                        "token_type_ids": torch.tensor(token_type_ids, dtype=torch.long),
+                        "sentence_order_label": torch.tensor(0 if is_next else 1, dtype=torch.long),
+                    }
+                    examples.append(example)
+                current_chunk = []  # clear current chunk
+                current_length = 0  # reset current text length
+            i += 1  # go to next line
+        return examples
+
+    def __len__(self):
+        return len(self.examples)
+
+    def __getitem__(self, i) -> Dict[str, torch.tensor]:
+        return self.examples[i]
+
+
+class TextDatasetForNextSentencePrediction(Dataset):
+    """
+    This will be superseded by a framework-agnostic approach soon.
+    """
+
+    def __init__(
+        self,
+        tokenizer: PreTrainedTokenizer,
+        file_path: str,
+        block_size: int,
+        overwrite_cache=False,
+        short_seq_probability=0.1,
+        nsp_probability=0.5,
+    ):
+        warnings.warn(
+            DEPRECATION_WARNING.format(
+                "https://github.com/huggingface/transformers/blob/main/examples/pytorch/language-modeling/run_mlm.py"
+            ),
+            FutureWarning,
+        )
+        if not os.path.isfile(file_path):
+            raise ValueError(f"Input file path {file_path} not found")
+
+        self.short_seq_probability = short_seq_probability
+        self.nsp_probability = nsp_probability
+
+        directory, filename = os.path.split(file_path)
+        cached_features_file = os.path.join(
+            directory,
+            f"cached_nsp_{tokenizer.__class__.__name__}_{block_size}_{filename}",
+        )
+
+        self.tokenizer = tokenizer
+
+        # Make sure only the first process in distributed training processes the dataset,
+        # and the others will use the cache.
+        lock_path = cached_features_file + ".lock"
+
+        # Input file format:
+        # (1) One sentence per line. These should ideally be actual sentences, not
+        # entire paragraphs or arbitrary spans of text. (Because we use the
+        # sentence boundaries for the "next sentence prediction" task).
+        # (2) Blank lines between documents. Document boundaries are needed so
+        # that the "next sentence prediction" task doesn't span between documents.
+        #
+        # Example:
+        # I am very happy.
+        # Here is the second sentence.
+        #
+        # A new document.
+
+        with FileLock(lock_path):
+            if os.path.exists(cached_features_file) and not overwrite_cache:
+                start = time.time()
+                with open(cached_features_file, "rb") as handle:
+                    self.examples = pickle.load(handle)
+                logger.info(
+                    f"Loading features from cached file {cached_features_file} [took %.3f s]", time.time() - start
+                )
+            else:
+                logger.info(f"Creating features from dataset file at {directory}")
+
+                self.documents = [[]]
+                with open(file_path, encoding="utf-8") as f:
+                    while True:
+                        line = f.readline()
+                        if not line:
+                            break
+                        line = line.strip()
+
+                        # Empty lines are used as document delimiters
+                        if not line and len(self.documents[-1]) != 0:
+                            self.documents.append([])
+                        tokens = tokenizer.tokenize(line)
+                        tokens = tokenizer.convert_tokens_to_ids(tokens)
+                        if tokens:
+                            self.documents[-1].append(tokens)
+
+                logger.info(f"Creating examples from {len(self.documents)} documents.")
+                self.examples = []
+                for doc_index, document in enumerate(self.documents):
+                    self.create_examples_from_document(document, doc_index, block_size)
+
+                start = time.time()
+                with open(cached_features_file, "wb") as handle:
+                    pickle.dump(self.examples, handle, protocol=pickle.HIGHEST_PROTOCOL)
+                logger.info(
+                    f"Saving features into cached file {cached_features_file} [took {time.time() - start:.3f} s]"
+                )
+
+    def create_examples_from_document(self, document: List[List[int]], doc_index: int, block_size: int):
+        """Creates examples for a single document."""
+
+        max_num_tokens = block_size - self.tokenizer.num_special_tokens_to_add(pair=True)
+
+        # We *usually* want to fill up the entire sequence since we are padding
+        # to `block_size` anyways, so short sequences are generally wasted
+        # computation. However, we *sometimes*
+        # (i.e., short_seq_prob == 0.1 == 10% of the time) want to use shorter
+        # sequences to minimize the mismatch between pretraining and fine-tuning.
+        # The `target_seq_length` is just a rough target however, whereas
+        # `block_size` is a hard limit.
+        target_seq_length = max_num_tokens
+        if random.random() < self.short_seq_probability:
+            target_seq_length = random.randint(2, max_num_tokens)
+
+        current_chunk = []  # a buffer stored current working segments
+        current_length = 0
+        i = 0
+
+        while i < len(document):
+            segment = document[i]
+            current_chunk.append(segment)
+            current_length += len(segment)
+            if i == len(document) - 1 or current_length >= target_seq_length:
+                if current_chunk:
+                    # `a_end` is how many segments from `current_chunk` go into the `A`
+                    # (first) sentence.
+                    a_end = 1
+                    if len(current_chunk) >= 2:
+                        a_end = random.randint(1, len(current_chunk) - 1)
+
+                    tokens_a = []
+                    for j in range(a_end):
+                        tokens_a.extend(current_chunk[j])
+
+                    tokens_b = []
+
+                    if len(current_chunk) == 1 or random.random() < self.nsp_probability:
+                        is_random_next = True
+                        target_b_length = target_seq_length - len(tokens_a)
+
+                        # This should rarely go for more than one iteration for large
+                        # corpora. However, just to be careful, we try to make sure that
+                        # the random document is not the same as the document
+                        # we're processing.
+                        for _ in range(10):
+                            random_document_index = random.randint(0, len(self.documents) - 1)
+                            if random_document_index != doc_index:
+                                break
+
+                        random_document = self.documents[random_document_index]
+                        random_start = random.randint(0, len(random_document) - 1)
+                        for j in range(random_start, len(random_document)):
+                            tokens_b.extend(random_document[j])
+                            if len(tokens_b) >= target_b_length:
+                                break
+                        # We didn't actually use these segments so we "put them back" so
+                        # they don't go to waste.
+                        num_unused_segments = len(current_chunk) - a_end
+                        i -= num_unused_segments
+                    # Actual next
+                    else:
+                        is_random_next = False
+                        for j in range(a_end, len(current_chunk)):
+                            tokens_b.extend(current_chunk[j])
+
+                    if not (len(tokens_a) >= 1):
+                        raise ValueError(f"Length of sequence a is {len(tokens_a)} which must be no less than 1")
+                    if not (len(tokens_b) >= 1):
+                        raise ValueError(f"Length of sequence b is {len(tokens_b)} which must be no less than 1")
+
+                    # add special tokens
+                    input_ids = self.tokenizer.build_inputs_with_special_tokens(tokens_a, tokens_b)
+                    # add token type ids, 0 for sentence a, 1 for sentence b
+                    token_type_ids = self.tokenizer.create_token_type_ids_from_sequences(tokens_a, tokens_b)
+
+                    example = {
+                        "input_ids": torch.tensor(input_ids, dtype=torch.long),
+                        "token_type_ids": torch.tensor(token_type_ids, dtype=torch.long),
+                        "next_sentence_label": torch.tensor(1 if is_random_next else 0, dtype=torch.long),
+                    }
+
+                    self.examples.append(example)
+
+                current_chunk = []
+                current_length = 0
+
+            i += 1
+
+    def __len__(self):
+        return len(self.examples)
+
+    def __getitem__(self, i):
+        return self.examples[i]
diff --git a/.venv/lib/python3.11/site-packages/transformers/data/datasets/squad.py b/.venv/lib/python3.11/site-packages/transformers/data/datasets/squad.py
new file mode 100644
index 0000000000000000000000000000000000000000..d81217d818afff5e297e6992d979847cf7c0f4cc
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/transformers/data/datasets/squad.py
@@ -0,0 +1,229 @@
+# Copyright 2020 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import time
+from dataclasses import dataclass, field
+from enum import Enum
+from typing import Dict, List, Optional, Union
+
+import torch
+from filelock import FileLock
+from torch.utils.data import Dataset
+
+from ...models.auto.modeling_auto import MODEL_FOR_QUESTION_ANSWERING_MAPPING
+from ...tokenization_utils import PreTrainedTokenizer
+from ...utils import logging
+from ..processors.squad import SquadFeatures, SquadV1Processor, SquadV2Processor, squad_convert_examples_to_features
+
+
+logger = logging.get_logger(__name__)
+
+MODEL_CONFIG_CLASSES = list(MODEL_FOR_QUESTION_ANSWERING_MAPPING.keys())
+MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES)
+
+
+@dataclass
+class SquadDataTrainingArguments:
+    """
+    Arguments pertaining to what data we are going to input our model for training and eval.
+    """
+
+    model_type: str = field(
+        default=None, metadata={"help": "Model type selected in the list: " + ", ".join(MODEL_TYPES)}
+    )
+    data_dir: str = field(
+        default=None, metadata={"help": "The input data dir. Should contain the .json files for the SQuAD task."}
+    )
+    max_seq_length: int = field(
+        default=128,
+        metadata={
+            "help": (
+                "The maximum total input sequence length after tokenization. Sequences longer "
+                "than this will be truncated, sequences shorter will be padded."
+            )
+        },
+    )
+    doc_stride: int = field(
+        default=128,
+        metadata={"help": "When splitting up a long document into chunks, how much stride to take between chunks."},
+    )
+    max_query_length: int = field(
+        default=64,
+        metadata={
+            "help": (
+                "The maximum number of tokens for the question. Questions longer than this will "
+                "be truncated to this length."
+            )
+        },
+    )
+    max_answer_length: int = field(
+        default=30,
+        metadata={
+            "help": (
+                "The maximum length of an answer that can be generated. This is needed because the start "
+                "and end predictions are not conditioned on one another."
+            )
+        },
+    )
+    overwrite_cache: bool = field(
+        default=False, metadata={"help": "Overwrite the cached training and evaluation sets"}
+    )
+    version_2_with_negative: bool = field(
+        default=False, metadata={"help": "If true, the SQuAD examples contain some that do not have an answer."}
+    )
+    null_score_diff_threshold: float = field(
+        default=0.0, metadata={"help": "If null_score - best_non_null is greater than the threshold predict null."}
+    )
+    n_best_size: int = field(
+        default=20, metadata={"help": "If null_score - best_non_null is greater than the threshold predict null."}
+    )
+    lang_id: int = field(
+        default=0,
+        metadata={
+            "help": (
+                "language id of input for language-specific xlm models (see"
+                " tokenization_xlm.PRETRAINED_INIT_CONFIGURATION)"
+            )
+        },
+    )
+    threads: int = field(default=1, metadata={"help": "multiple threads for converting example to features"})
+
+
+class Split(Enum):
+    train = "train"
+    dev = "dev"
+
+
+class SquadDataset(Dataset):
+    """
+    This will be superseded by a framework-agnostic approach soon.
+    """
+
+    args: SquadDataTrainingArguments
+    features: List[SquadFeatures]
+    mode: Split
+    is_language_sensitive: bool
+
+    def __init__(
+        self,
+        args: SquadDataTrainingArguments,
+        tokenizer: PreTrainedTokenizer,
+        limit_length: Optional[int] = None,
+        mode: Union[str, Split] = Split.train,
+        is_language_sensitive: Optional[bool] = False,
+        cache_dir: Optional[str] = None,
+        dataset_format: Optional[str] = "pt",
+    ):
+        self.args = args
+        self.is_language_sensitive = is_language_sensitive
+        self.processor = SquadV2Processor() if args.version_2_with_negative else SquadV1Processor()
+        if isinstance(mode, str):
+            try:
+                mode = Split[mode]
+            except KeyError:
+                raise KeyError("mode is not a valid split name")
+        self.mode = mode
+        # Load data features from cache or dataset file
+        version_tag = "v2" if args.version_2_with_negative else "v1"
+        cached_features_file = os.path.join(
+            cache_dir if cache_dir is not None else args.data_dir,
+            f"cached_{mode.value}_{tokenizer.__class__.__name__}_{args.max_seq_length}_{version_tag}",
+        )
+
+        # Make sure only the first process in distributed training processes the dataset,
+        # and the others will use the cache.
+        lock_path = cached_features_file + ".lock"
+        with FileLock(lock_path):
+            if os.path.exists(cached_features_file) and not args.overwrite_cache:
+                start = time.time()
+                self.old_features = torch.load(cached_features_file)
+
+                # Legacy cache files have only features, while new cache files
+                # will have dataset and examples also.
+                self.features = self.old_features["features"]
+                self.dataset = self.old_features.get("dataset", None)
+                self.examples = self.old_features.get("examples", None)
+                logger.info(
+                    f"Loading features from cached file {cached_features_file} [took %.3f s]", time.time() - start
+                )
+
+                if self.dataset is None or self.examples is None:
+                    logger.warning(
+                        f"Deleting cached file {cached_features_file} will allow dataset and examples to be cached in"
+                        " future run"
+                    )
+            else:
+                if mode == Split.dev:
+                    self.examples = self.processor.get_dev_examples(args.data_dir)
+                else:
+                    self.examples = self.processor.get_train_examples(args.data_dir)
+
+                self.features, self.dataset = squad_convert_examples_to_features(
+                    examples=self.examples,
+                    tokenizer=tokenizer,
+                    max_seq_length=args.max_seq_length,
+                    doc_stride=args.doc_stride,
+                    max_query_length=args.max_query_length,
+                    is_training=mode == Split.train,
+                    threads=args.threads,
+                    return_dataset=dataset_format,
+                )
+
+                start = time.time()
+                torch.save(
+                    {"features": self.features, "dataset": self.dataset, "examples": self.examples},
+                    cached_features_file,
+                )
+                # ^ This seems to take a lot of time so I want to investigate why and how we can improve.
+                logger.info(
+                    f"Saving features into cached file {cached_features_file} [took {time.time() - start:.3f} s]"
+                )
+
+    def __len__(self):
+        return len(self.features)
+
+    def __getitem__(self, i) -> Dict[str, torch.Tensor]:
+        # Convert to Tensors and build dataset
+        feature = self.features[i]
+
+        input_ids = torch.tensor(feature.input_ids, dtype=torch.long)
+        attention_mask = torch.tensor(feature.attention_mask, dtype=torch.long)
+        token_type_ids = torch.tensor(feature.token_type_ids, dtype=torch.long)
+        cls_index = torch.tensor(feature.cls_index, dtype=torch.long)
+        p_mask = torch.tensor(feature.p_mask, dtype=torch.float)
+        is_impossible = torch.tensor(feature.is_impossible, dtype=torch.float)
+
+        inputs = {
+            "input_ids": input_ids,
+            "attention_mask": attention_mask,
+            "token_type_ids": token_type_ids,
+        }
+
+        if self.args.model_type in ["xlm", "roberta", "distilbert", "camembert"]:
+            del inputs["token_type_ids"]
+
+        if self.args.model_type in ["xlnet", "xlm"]:
+            inputs.update({"cls_index": cls_index, "p_mask": p_mask})
+            if self.args.version_2_with_negative:
+                inputs.update({"is_impossible": is_impossible})
+            if self.is_language_sensitive:
+                inputs.update({"langs": (torch.ones(input_ids.shape, dtype=torch.int64) * self.args.lang_id)})
+
+        if self.mode == Split.train:
+            start_positions = torch.tensor(feature.start_position, dtype=torch.long)
+            end_positions = torch.tensor(feature.end_position, dtype=torch.long)
+            inputs.update({"start_positions": start_positions, "end_positions": end_positions})
+
+        return inputs
diff --git a/.venv/lib/python3.11/site-packages/transformers/data/metrics/__init__.py b/.venv/lib/python3.11/site-packages/transformers/data/metrics/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..ebd0d17aa55bb4529820ce347f6275d38f6c0caa
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/transformers/data/metrics/__init__.py
@@ -0,0 +1,98 @@
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import warnings
+
+from ...utils import is_sklearn_available, requires_backends
+
+
+if is_sklearn_available():
+    from scipy.stats import pearsonr, spearmanr
+    from sklearn.metrics import f1_score, matthews_corrcoef
+
+
+DEPRECATION_WARNING = (
+    "This metric will be removed from the library soon, metrics should be handled with the 🤗 Evaluate "
+    "library. You can have a look at this example script for pointers: "
+    "https://github.com/huggingface/transformers/blob/main/examples/pytorch/text-classification/run_glue.py"
+)
+
+
+def simple_accuracy(preds, labels):
+    warnings.warn(DEPRECATION_WARNING, FutureWarning)
+    requires_backends(simple_accuracy, "sklearn")
+    return (preds == labels).mean()
+
+
+def acc_and_f1(preds, labels):
+    warnings.warn(DEPRECATION_WARNING, FutureWarning)
+    requires_backends(acc_and_f1, "sklearn")
+    acc = simple_accuracy(preds, labels)
+    f1 = f1_score(y_true=labels, y_pred=preds)
+    return {
+        "acc": acc,
+        "f1": f1,
+        "acc_and_f1": (acc + f1) / 2,
+    }
+
+
+def pearson_and_spearman(preds, labels):
+    warnings.warn(DEPRECATION_WARNING, FutureWarning)
+    requires_backends(pearson_and_spearman, "sklearn")
+    pearson_corr = pearsonr(preds, labels)[0]
+    spearman_corr = spearmanr(preds, labels)[0]
+    return {
+        "pearson": pearson_corr,
+        "spearmanr": spearman_corr,
+        "corr": (pearson_corr + spearman_corr) / 2,
+    }
+
+
+def glue_compute_metrics(task_name, preds, labels):
+    warnings.warn(DEPRECATION_WARNING, FutureWarning)
+    requires_backends(glue_compute_metrics, "sklearn")
+    assert len(preds) == len(labels), f"Predictions and labels have mismatched lengths {len(preds)} and {len(labels)}"
+    if task_name == "cola":
+        return {"mcc": matthews_corrcoef(labels, preds)}
+    elif task_name == "sst-2":
+        return {"acc": simple_accuracy(preds, labels)}
+    elif task_name == "mrpc":
+        return acc_and_f1(preds, labels)
+    elif task_name == "sts-b":
+        return pearson_and_spearman(preds, labels)
+    elif task_name == "qqp":
+        return acc_and_f1(preds, labels)
+    elif task_name == "mnli":
+        return {"mnli/acc": simple_accuracy(preds, labels)}
+    elif task_name == "mnli-mm":
+        return {"mnli-mm/acc": simple_accuracy(preds, labels)}
+    elif task_name == "qnli":
+        return {"acc": simple_accuracy(preds, labels)}
+    elif task_name == "rte":
+        return {"acc": simple_accuracy(preds, labels)}
+    elif task_name == "wnli":
+        return {"acc": simple_accuracy(preds, labels)}
+    elif task_name == "hans":
+        return {"acc": simple_accuracy(preds, labels)}
+    else:
+        raise KeyError(task_name)
+
+
+def xnli_compute_metrics(task_name, preds, labels):
+    warnings.warn(DEPRECATION_WARNING, FutureWarning)
+    requires_backends(xnli_compute_metrics, "sklearn")
+    if len(preds) != len(labels):
+        raise ValueError(f"Predictions and labels have mismatched lengths {len(preds)} and {len(labels)}")
+    if task_name == "xnli":
+        return {"acc": simple_accuracy(preds, labels)}
+    else:
+        raise KeyError(task_name)
diff --git a/.venv/lib/python3.11/site-packages/transformers/data/metrics/__pycache__/__init__.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/data/metrics/__pycache__/__init__.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..268f3763374e5c6d88b606baa1b404ca8e74aa62
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/data/metrics/__pycache__/__init__.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/transformers/data/metrics/__pycache__/squad_metrics.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/data/metrics/__pycache__/squad_metrics.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b9f9b4cc3ec5395a32b4846d2a99651570a6a272
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/data/metrics/__pycache__/squad_metrics.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/transformers/data/metrics/squad_metrics.py b/.venv/lib/python3.11/site-packages/transformers/data/metrics/squad_metrics.py
new file mode 100644
index 0000000000000000000000000000000000000000..5d98a0bfcf15c8941ced8de2f778dab5bbb57e00
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/transformers/data/metrics/squad_metrics.py
@@ -0,0 +1,779 @@
+# Copyright 2020 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Very heavily inspired by the official evaluation script for SQuAD version 2.0 which was modified by XLNet authors to
+update `find_best_threshold` scripts for SQuAD V2.0
+
+In addition to basic functionality, we also compute additional statistics and plot precision-recall curves if an
+additional na_prob.json file is provided. This file is expected to map question ID's to the model's predicted
+probability that a question is unanswerable.
+"""
+
+import collections
+import json
+import math
+import re
+import string
+
+from ...models.bert import BasicTokenizer
+from ...utils import logging
+
+
+logger = logging.get_logger(__name__)
+
+
+def normalize_answer(s):
+    """Lower text and remove punctuation, articles and extra whitespace."""
+
+    def remove_articles(text):
+        regex = re.compile(r"\b(a|an|the)\b", re.UNICODE)
+        return re.sub(regex, " ", text)
+
+    def white_space_fix(text):
+        return " ".join(text.split())
+
+    def remove_punc(text):
+        exclude = set(string.punctuation)
+        return "".join(ch for ch in text if ch not in exclude)
+
+    def lower(text):
+        return text.lower()
+
+    return white_space_fix(remove_articles(remove_punc(lower(s))))
+
+
+def get_tokens(s):
+    if not s:
+        return []
+    return normalize_answer(s).split()
+
+
+def compute_exact(a_gold, a_pred):
+    return int(normalize_answer(a_gold) == normalize_answer(a_pred))
+
+
+def compute_f1(a_gold, a_pred):
+    gold_toks = get_tokens(a_gold)
+    pred_toks = get_tokens(a_pred)
+    common = collections.Counter(gold_toks) & collections.Counter(pred_toks)
+    num_same = sum(common.values())
+    if len(gold_toks) == 0 or len(pred_toks) == 0:
+        # If either is no-answer, then F1 is 1 if they agree, 0 otherwise
+        return int(gold_toks == pred_toks)
+    if num_same == 0:
+        return 0
+    precision = 1.0 * num_same / len(pred_toks)
+    recall = 1.0 * num_same / len(gold_toks)
+    f1 = (2 * precision * recall) / (precision + recall)
+    return f1
+
+
+def get_raw_scores(examples, preds):
+    """
+    Computes the exact and f1 scores from the examples and the model predictions
+    """
+    exact_scores = {}
+    f1_scores = {}
+
+    for example in examples:
+        qas_id = example.qas_id
+        gold_answers = [answer["text"] for answer in example.answers if normalize_answer(answer["text"])]
+
+        if not gold_answers:
+            # For unanswerable questions, only correct answer is empty string
+            gold_answers = [""]
+
+        if qas_id not in preds:
+            print(f"Missing prediction for {qas_id}")
+            continue
+
+        prediction = preds[qas_id]
+        exact_scores[qas_id] = max(compute_exact(a, prediction) for a in gold_answers)
+        f1_scores[qas_id] = max(compute_f1(a, prediction) for a in gold_answers)
+
+    return exact_scores, f1_scores
+
+
+def apply_no_ans_threshold(scores, na_probs, qid_to_has_ans, na_prob_thresh):
+    new_scores = {}
+    for qid, s in scores.items():
+        pred_na = na_probs[qid] > na_prob_thresh
+        if pred_na:
+            new_scores[qid] = float(not qid_to_has_ans[qid])
+        else:
+            new_scores[qid] = s
+    return new_scores
+
+
+def make_eval_dict(exact_scores, f1_scores, qid_list=None):
+    if not qid_list:
+        total = len(exact_scores)
+        return collections.OrderedDict(
+            [
+                ("exact", 100.0 * sum(exact_scores.values()) / total),
+                ("f1", 100.0 * sum(f1_scores.values()) / total),
+                ("total", total),
+            ]
+        )
+    else:
+        total = len(qid_list)
+        return collections.OrderedDict(
+            [
+                ("exact", 100.0 * sum(exact_scores[k] for k in qid_list) / total),
+                ("f1", 100.0 * sum(f1_scores[k] for k in qid_list) / total),
+                ("total", total),
+            ]
+        )
+
+
+def merge_eval(main_eval, new_eval, prefix):
+    for k in new_eval:
+        main_eval[f"{prefix}_{k}"] = new_eval[k]
+
+
+def find_best_thresh_v2(preds, scores, na_probs, qid_to_has_ans):
+    num_no_ans = sum(1 for k in qid_to_has_ans if not qid_to_has_ans[k])
+    cur_score = num_no_ans
+    best_score = cur_score
+    best_thresh = 0.0
+    qid_list = sorted(na_probs, key=lambda k: na_probs[k])
+    for i, qid in enumerate(qid_list):
+        if qid not in scores:
+            continue
+        if qid_to_has_ans[qid]:
+            diff = scores[qid]
+        else:
+            if preds[qid]:
+                diff = -1
+            else:
+                diff = 0
+        cur_score += diff
+        if cur_score > best_score:
+            best_score = cur_score
+            best_thresh = na_probs[qid]
+
+    has_ans_score, has_ans_cnt = 0, 0
+    for qid in qid_list:
+        if not qid_to_has_ans[qid]:
+            continue
+        has_ans_cnt += 1
+
+        if qid not in scores:
+            continue
+        has_ans_score += scores[qid]
+
+    return 100.0 * best_score / len(scores), best_thresh, 1.0 * has_ans_score / has_ans_cnt
+
+
+def find_all_best_thresh_v2(main_eval, preds, exact_raw, f1_raw, na_probs, qid_to_has_ans):
+    best_exact, exact_thresh, has_ans_exact = find_best_thresh_v2(preds, exact_raw, na_probs, qid_to_has_ans)
+    best_f1, f1_thresh, has_ans_f1 = find_best_thresh_v2(preds, f1_raw, na_probs, qid_to_has_ans)
+    main_eval["best_exact"] = best_exact
+    main_eval["best_exact_thresh"] = exact_thresh
+    main_eval["best_f1"] = best_f1
+    main_eval["best_f1_thresh"] = f1_thresh
+    main_eval["has_ans_exact"] = has_ans_exact
+    main_eval["has_ans_f1"] = has_ans_f1
+
+
+def find_best_thresh(preds, scores, na_probs, qid_to_has_ans):
+    num_no_ans = sum(1 for k in qid_to_has_ans if not qid_to_has_ans[k])
+    cur_score = num_no_ans
+    best_score = cur_score
+    best_thresh = 0.0
+    qid_list = sorted(na_probs, key=lambda k: na_probs[k])
+    for _, qid in enumerate(qid_list):
+        if qid not in scores:
+            continue
+        if qid_to_has_ans[qid]:
+            diff = scores[qid]
+        else:
+            if preds[qid]:
+                diff = -1
+            else:
+                diff = 0
+        cur_score += diff
+        if cur_score > best_score:
+            best_score = cur_score
+            best_thresh = na_probs[qid]
+    return 100.0 * best_score / len(scores), best_thresh
+
+
+def find_all_best_thresh(main_eval, preds, exact_raw, f1_raw, na_probs, qid_to_has_ans):
+    best_exact, exact_thresh = find_best_thresh(preds, exact_raw, na_probs, qid_to_has_ans)
+    best_f1, f1_thresh = find_best_thresh(preds, f1_raw, na_probs, qid_to_has_ans)
+
+    main_eval["best_exact"] = best_exact
+    main_eval["best_exact_thresh"] = exact_thresh
+    main_eval["best_f1"] = best_f1
+    main_eval["best_f1_thresh"] = f1_thresh
+
+
+def squad_evaluate(examples, preds, no_answer_probs=None, no_answer_probability_threshold=1.0):
+    qas_id_to_has_answer = {example.qas_id: bool(example.answers) for example in examples}
+    has_answer_qids = [qas_id for qas_id, has_answer in qas_id_to_has_answer.items() if has_answer]
+    no_answer_qids = [qas_id for qas_id, has_answer in qas_id_to_has_answer.items() if not has_answer]
+
+    if no_answer_probs is None:
+        no_answer_probs = {k: 0.0 for k in preds}
+
+    exact, f1 = get_raw_scores(examples, preds)
+
+    exact_threshold = apply_no_ans_threshold(
+        exact, no_answer_probs, qas_id_to_has_answer, no_answer_probability_threshold
+    )
+    f1_threshold = apply_no_ans_threshold(f1, no_answer_probs, qas_id_to_has_answer, no_answer_probability_threshold)
+
+    evaluation = make_eval_dict(exact_threshold, f1_threshold)
+
+    if has_answer_qids:
+        has_ans_eval = make_eval_dict(exact_threshold, f1_threshold, qid_list=has_answer_qids)
+        merge_eval(evaluation, has_ans_eval, "HasAns")
+
+    if no_answer_qids:
+        no_ans_eval = make_eval_dict(exact_threshold, f1_threshold, qid_list=no_answer_qids)
+        merge_eval(evaluation, no_ans_eval, "NoAns")
+
+    if no_answer_probs:
+        find_all_best_thresh(evaluation, preds, exact, f1, no_answer_probs, qas_id_to_has_answer)
+
+    return evaluation
+
+
+def get_final_text(pred_text, orig_text, do_lower_case, verbose_logging=False):
+    """Project the tokenized prediction back to the original text."""
+
+    # When we created the data, we kept track of the alignment between original
+    # (whitespace tokenized) tokens and our WordPiece tokenized tokens. So
+    # now `orig_text` contains the span of our original text corresponding to the
+    # span that we predicted.
+    #
+    # However, `orig_text` may contain extra characters that we don't want in
+    # our prediction.
+    #
+    # For example, let's say:
+    #   pred_text = steve smith
+    #   orig_text = Steve Smith's
+    #
+    # We don't want to return `orig_text` because it contains the extra "'s".
+    #
+    # We don't want to return `pred_text` because it's already been normalized
+    # (the SQuAD eval script also does punctuation stripping/lower casing but
+    # our tokenizer does additional normalization like stripping accent
+    # characters).
+    #
+    # What we really want to return is "Steve Smith".
+    #
+    # Therefore, we have to apply a semi-complicated alignment heuristic between
+    # `pred_text` and `orig_text` to get a character-to-character alignment. This
+    # can fail in certain cases in which case we just return `orig_text`.
+
+    def _strip_spaces(text):
+        ns_chars = []
+        ns_to_s_map = collections.OrderedDict()
+        for i, c in enumerate(text):
+            if c == " ":
+                continue
+            ns_to_s_map[len(ns_chars)] = i
+            ns_chars.append(c)
+        ns_text = "".join(ns_chars)
+        return (ns_text, ns_to_s_map)
+
+    # We first tokenize `orig_text`, strip whitespace from the result
+    # and `pred_text`, and check if they are the same length. If they are
+    # NOT the same length, the heuristic has failed. If they are the same
+    # length, we assume the characters are one-to-one aligned.
+    tokenizer = BasicTokenizer(do_lower_case=do_lower_case)
+
+    tok_text = " ".join(tokenizer.tokenize(orig_text))
+
+    start_position = tok_text.find(pred_text)
+    if start_position == -1:
+        if verbose_logging:
+            logger.info(f"Unable to find text: '{pred_text}' in '{orig_text}'")
+        return orig_text
+    end_position = start_position + len(pred_text) - 1
+
+    (orig_ns_text, orig_ns_to_s_map) = _strip_spaces(orig_text)
+    (tok_ns_text, tok_ns_to_s_map) = _strip_spaces(tok_text)
+
+    if len(orig_ns_text) != len(tok_ns_text):
+        if verbose_logging:
+            logger.info(f"Length not equal after stripping spaces: '{orig_ns_text}' vs '{tok_ns_text}'")
+        return orig_text
+
+    # We then project the characters in `pred_text` back to `orig_text` using
+    # the character-to-character alignment.
+    tok_s_to_ns_map = {}
+    for i, tok_index in tok_ns_to_s_map.items():
+        tok_s_to_ns_map[tok_index] = i
+
+    orig_start_position = None
+    if start_position in tok_s_to_ns_map:
+        ns_start_position = tok_s_to_ns_map[start_position]
+        if ns_start_position in orig_ns_to_s_map:
+            orig_start_position = orig_ns_to_s_map[ns_start_position]
+
+    if orig_start_position is None:
+        if verbose_logging:
+            logger.info("Couldn't map start position")
+        return orig_text
+
+    orig_end_position = None
+    if end_position in tok_s_to_ns_map:
+        ns_end_position = tok_s_to_ns_map[end_position]
+        if ns_end_position in orig_ns_to_s_map:
+            orig_end_position = orig_ns_to_s_map[ns_end_position]
+
+    if orig_end_position is None:
+        if verbose_logging:
+            logger.info("Couldn't map end position")
+        return orig_text
+
+    output_text = orig_text[orig_start_position : (orig_end_position + 1)]
+    return output_text
+
+
+def _get_best_indexes(logits, n_best_size):
+    """Get the n-best logits from a list."""
+    index_and_score = sorted(enumerate(logits), key=lambda x: x[1], reverse=True)
+
+    best_indexes = []
+    for i in range(len(index_and_score)):
+        if i >= n_best_size:
+            break
+        best_indexes.append(index_and_score[i][0])
+    return best_indexes
+
+
+def _compute_softmax(scores):
+    """Compute softmax probability over raw logits."""
+    if not scores:
+        return []
+
+    max_score = None
+    for score in scores:
+        if max_score is None or score > max_score:
+            max_score = score
+
+    exp_scores = []
+    total_sum = 0.0
+    for score in scores:
+        x = math.exp(score - max_score)
+        exp_scores.append(x)
+        total_sum += x
+
+    probs = []
+    for score in exp_scores:
+        probs.append(score / total_sum)
+    return probs
+
+
+def compute_predictions_logits(
+    all_examples,
+    all_features,
+    all_results,
+    n_best_size,
+    max_answer_length,
+    do_lower_case,
+    output_prediction_file,
+    output_nbest_file,
+    output_null_log_odds_file,
+    verbose_logging,
+    version_2_with_negative,
+    null_score_diff_threshold,
+    tokenizer,
+):
+    """Write final predictions to the json file and log-odds of null if needed."""
+    if output_prediction_file:
+        logger.info(f"Writing predictions to: {output_prediction_file}")
+    if output_nbest_file:
+        logger.info(f"Writing nbest to: {output_nbest_file}")
+    if output_null_log_odds_file and version_2_with_negative:
+        logger.info(f"Writing null_log_odds to: {output_null_log_odds_file}")
+
+    example_index_to_features = collections.defaultdict(list)
+    for feature in all_features:
+        example_index_to_features[feature.example_index].append(feature)
+
+    unique_id_to_result = {}
+    for result in all_results:
+        unique_id_to_result[result.unique_id] = result
+
+    _PrelimPrediction = collections.namedtuple(  # pylint: disable=invalid-name
+        "PrelimPrediction", ["feature_index", "start_index", "end_index", "start_logit", "end_logit"]
+    )
+
+    all_predictions = collections.OrderedDict()
+    all_nbest_json = collections.OrderedDict()
+    scores_diff_json = collections.OrderedDict()
+
+    for example_index, example in enumerate(all_examples):
+        features = example_index_to_features[example_index]
+
+        prelim_predictions = []
+        # keep track of the minimum score of null start+end of position 0
+        score_null = 1000000  # large and positive
+        min_null_feature_index = 0  # the paragraph slice with min null score
+        null_start_logit = 0  # the start logit at the slice with min null score
+        null_end_logit = 0  # the end logit at the slice with min null score
+        for feature_index, feature in enumerate(features):
+            result = unique_id_to_result[feature.unique_id]
+            start_indexes = _get_best_indexes(result.start_logits, n_best_size)
+            end_indexes = _get_best_indexes(result.end_logits, n_best_size)
+            # if we could have irrelevant answers, get the min score of irrelevant
+            if version_2_with_negative:
+                feature_null_score = result.start_logits[0] + result.end_logits[0]
+                if feature_null_score < score_null:
+                    score_null = feature_null_score
+                    min_null_feature_index = feature_index
+                    null_start_logit = result.start_logits[0]
+                    null_end_logit = result.end_logits[0]
+            for start_index in start_indexes:
+                for end_index in end_indexes:
+                    # We could hypothetically create invalid predictions, e.g., predict
+                    # that the start of the span is in the question. We throw out all
+                    # invalid predictions.
+                    if start_index >= len(feature.tokens):
+                        continue
+                    if end_index >= len(feature.tokens):
+                        continue
+                    if start_index not in feature.token_to_orig_map:
+                        continue
+                    if end_index not in feature.token_to_orig_map:
+                        continue
+                    if not feature.token_is_max_context.get(start_index, False):
+                        continue
+                    if end_index < start_index:
+                        continue
+                    length = end_index - start_index + 1
+                    if length > max_answer_length:
+                        continue
+                    prelim_predictions.append(
+                        _PrelimPrediction(
+                            feature_index=feature_index,
+                            start_index=start_index,
+                            end_index=end_index,
+                            start_logit=result.start_logits[start_index],
+                            end_logit=result.end_logits[end_index],
+                        )
+                    )
+        if version_2_with_negative:
+            prelim_predictions.append(
+                _PrelimPrediction(
+                    feature_index=min_null_feature_index,
+                    start_index=0,
+                    end_index=0,
+                    start_logit=null_start_logit,
+                    end_logit=null_end_logit,
+                )
+            )
+        prelim_predictions = sorted(prelim_predictions, key=lambda x: (x.start_logit + x.end_logit), reverse=True)
+
+        _NbestPrediction = collections.namedtuple(  # pylint: disable=invalid-name
+            "NbestPrediction", ["text", "start_logit", "end_logit"]
+        )
+
+        seen_predictions = {}
+        nbest = []
+        for pred in prelim_predictions:
+            if len(nbest) >= n_best_size:
+                break
+            feature = features[pred.feature_index]
+            if pred.start_index > 0:  # this is a non-null prediction
+                tok_tokens = feature.tokens[pred.start_index : (pred.end_index + 1)]
+                orig_doc_start = feature.token_to_orig_map[pred.start_index]
+                orig_doc_end = feature.token_to_orig_map[pred.end_index]
+                orig_tokens = example.doc_tokens[orig_doc_start : (orig_doc_end + 1)]
+
+                tok_text = tokenizer.convert_tokens_to_string(tok_tokens)
+
+                # tok_text = " ".join(tok_tokens)
+                #
+                # # De-tokenize WordPieces that have been split off.
+                # tok_text = tok_text.replace(" ##", "")
+                # tok_text = tok_text.replace("##", "")
+
+                # Clean whitespace
+                tok_text = tok_text.strip()
+                tok_text = " ".join(tok_text.split())
+                orig_text = " ".join(orig_tokens)
+
+                final_text = get_final_text(tok_text, orig_text, do_lower_case, verbose_logging)
+                if final_text in seen_predictions:
+                    continue
+
+                seen_predictions[final_text] = True
+            else:
+                final_text = ""
+                seen_predictions[final_text] = True
+
+            nbest.append(_NbestPrediction(text=final_text, start_logit=pred.start_logit, end_logit=pred.end_logit))
+        # if we didn't include the empty option in the n-best, include it
+        if version_2_with_negative:
+            if "" not in seen_predictions:
+                nbest.append(_NbestPrediction(text="", start_logit=null_start_logit, end_logit=null_end_logit))
+
+            # In very rare edge cases we could only have single null prediction.
+            # So we just create a nonce prediction in this case to avoid failure.
+            if len(nbest) == 1:
+                nbest.insert(0, _NbestPrediction(text="empty", start_logit=0.0, end_logit=0.0))
+
+        # In very rare edge cases we could have no valid predictions. So we
+        # just create a nonce prediction in this case to avoid failure.
+        if not nbest:
+            nbest.append(_NbestPrediction(text="empty", start_logit=0.0, end_logit=0.0))
+
+        if len(nbest) < 1:
+            raise ValueError("No valid predictions")
+
+        total_scores = []
+        best_non_null_entry = None
+        for entry in nbest:
+            total_scores.append(entry.start_logit + entry.end_logit)
+            if not best_non_null_entry:
+                if entry.text:
+                    best_non_null_entry = entry
+
+        probs = _compute_softmax(total_scores)
+
+        nbest_json = []
+        for i, entry in enumerate(nbest):
+            output = collections.OrderedDict()
+            output["text"] = entry.text
+            output["probability"] = probs[i]
+            output["start_logit"] = entry.start_logit
+            output["end_logit"] = entry.end_logit
+            nbest_json.append(output)
+
+        if len(nbest_json) < 1:
+            raise ValueError("No valid predictions")
+
+        if not version_2_with_negative:
+            all_predictions[example.qas_id] = nbest_json[0]["text"]
+        else:
+            # predict "" iff the null score - the score of best non-null > threshold
+            score_diff = score_null - best_non_null_entry.start_logit - (best_non_null_entry.end_logit)
+            scores_diff_json[example.qas_id] = score_diff
+            if score_diff > null_score_diff_threshold:
+                all_predictions[example.qas_id] = ""
+            else:
+                all_predictions[example.qas_id] = best_non_null_entry.text
+        all_nbest_json[example.qas_id] = nbest_json
+
+    if output_prediction_file:
+        with open(output_prediction_file, "w") as writer:
+            writer.write(json.dumps(all_predictions, indent=4) + "\n")
+
+    if output_nbest_file:
+        with open(output_nbest_file, "w") as writer:
+            writer.write(json.dumps(all_nbest_json, indent=4) + "\n")
+
+    if output_null_log_odds_file and version_2_with_negative:
+        with open(output_null_log_odds_file, "w") as writer:
+            writer.write(json.dumps(scores_diff_json, indent=4) + "\n")
+
+    return all_predictions
+
+
+def compute_predictions_log_probs(
+    all_examples,
+    all_features,
+    all_results,
+    n_best_size,
+    max_answer_length,
+    output_prediction_file,
+    output_nbest_file,
+    output_null_log_odds_file,
+    start_n_top,
+    end_n_top,
+    version_2_with_negative,
+    tokenizer,
+    verbose_logging,
+):
+    """
+    XLNet write prediction logic (more complex than Bert's). Write final predictions to the json file and log-odds of
+    null if needed.
+
+    Requires utils_squad_evaluate.py
+    """
+    _PrelimPrediction = collections.namedtuple(  # pylint: disable=invalid-name
+        "PrelimPrediction", ["feature_index", "start_index", "end_index", "start_log_prob", "end_log_prob"]
+    )
+
+    _NbestPrediction = collections.namedtuple(  # pylint: disable=invalid-name
+        "NbestPrediction", ["text", "start_log_prob", "end_log_prob"]
+    )
+
+    logger.info(f"Writing predictions to: {output_prediction_file}")
+
+    example_index_to_features = collections.defaultdict(list)
+    for feature in all_features:
+        example_index_to_features[feature.example_index].append(feature)
+
+    unique_id_to_result = {}
+    for result in all_results:
+        unique_id_to_result[result.unique_id] = result
+
+    all_predictions = collections.OrderedDict()
+    all_nbest_json = collections.OrderedDict()
+    scores_diff_json = collections.OrderedDict()
+
+    for example_index, example in enumerate(all_examples):
+        features = example_index_to_features[example_index]
+
+        prelim_predictions = []
+        # keep track of the minimum score of null start+end of position 0
+        score_null = 1000000  # large and positive
+
+        for feature_index, feature in enumerate(features):
+            result = unique_id_to_result[feature.unique_id]
+
+            cur_null_score = result.cls_logits
+
+            # if we could have irrelevant answers, get the min score of irrelevant
+            score_null = min(score_null, cur_null_score)
+
+            for i in range(start_n_top):
+                for j in range(end_n_top):
+                    start_log_prob = result.start_logits[i]
+                    start_index = result.start_top_index[i]
+
+                    j_index = i * end_n_top + j
+
+                    end_log_prob = result.end_logits[j_index]
+                    end_index = result.end_top_index[j_index]
+
+                    # We could hypothetically create invalid predictions, e.g., predict
+                    # that the start of the span is in the question. We throw out all
+                    # invalid predictions.
+                    if start_index >= feature.paragraph_len - 1:
+                        continue
+                    if end_index >= feature.paragraph_len - 1:
+                        continue
+
+                    if not feature.token_is_max_context.get(start_index, False):
+                        continue
+                    if end_index < start_index:
+                        continue
+                    length = end_index - start_index + 1
+                    if length > max_answer_length:
+                        continue
+
+                    prelim_predictions.append(
+                        _PrelimPrediction(
+                            feature_index=feature_index,
+                            start_index=start_index,
+                            end_index=end_index,
+                            start_log_prob=start_log_prob,
+                            end_log_prob=end_log_prob,
+                        )
+                    )
+
+        prelim_predictions = sorted(
+            prelim_predictions, key=lambda x: (x.start_log_prob + x.end_log_prob), reverse=True
+        )
+
+        seen_predictions = {}
+        nbest = []
+        for pred in prelim_predictions:
+            if len(nbest) >= n_best_size:
+                break
+            feature = features[pred.feature_index]
+
+            # XLNet un-tokenizer
+            # Let's keep it simple for now and see if we need all this later.
+            #
+            # tok_start_to_orig_index = feature.tok_start_to_orig_index
+            # tok_end_to_orig_index = feature.tok_end_to_orig_index
+            # start_orig_pos = tok_start_to_orig_index[pred.start_index]
+            # end_orig_pos = tok_end_to_orig_index[pred.end_index]
+            # paragraph_text = example.paragraph_text
+            # final_text = paragraph_text[start_orig_pos: end_orig_pos + 1].strip()
+
+            # Previously used Bert untokenizer
+            tok_tokens = feature.tokens[pred.start_index : (pred.end_index + 1)]
+            orig_doc_start = feature.token_to_orig_map[pred.start_index]
+            orig_doc_end = feature.token_to_orig_map[pred.end_index]
+            orig_tokens = example.doc_tokens[orig_doc_start : (orig_doc_end + 1)]
+            tok_text = tokenizer.convert_tokens_to_string(tok_tokens)
+
+            # Clean whitespace
+            tok_text = tok_text.strip()
+            tok_text = " ".join(tok_text.split())
+            orig_text = " ".join(orig_tokens)
+
+            if hasattr(tokenizer, "do_lower_case"):
+                do_lower_case = tokenizer.do_lower_case
+            else:
+                do_lower_case = tokenizer.do_lowercase_and_remove_accent
+
+            final_text = get_final_text(tok_text, orig_text, do_lower_case, verbose_logging)
+
+            if final_text in seen_predictions:
+                continue
+
+            seen_predictions[final_text] = True
+
+            nbest.append(
+                _NbestPrediction(text=final_text, start_log_prob=pred.start_log_prob, end_log_prob=pred.end_log_prob)
+            )
+
+        # In very rare edge cases we could have no valid predictions. So we
+        # just create a nonce prediction in this case to avoid failure.
+        if not nbest:
+            nbest.append(_NbestPrediction(text="", start_log_prob=-1e6, end_log_prob=-1e6))
+
+        total_scores = []
+        best_non_null_entry = None
+        for entry in nbest:
+            total_scores.append(entry.start_log_prob + entry.end_log_prob)
+            if not best_non_null_entry:
+                best_non_null_entry = entry
+
+        probs = _compute_softmax(total_scores)
+
+        nbest_json = []
+        for i, entry in enumerate(nbest):
+            output = collections.OrderedDict()
+            output["text"] = entry.text
+            output["probability"] = probs[i]
+            output["start_log_prob"] = entry.start_log_prob
+            output["end_log_prob"] = entry.end_log_prob
+            nbest_json.append(output)
+
+        if len(nbest_json) < 1:
+            raise ValueError("No valid predictions")
+        if best_non_null_entry is None:
+            raise ValueError("No valid predictions")
+
+        score_diff = score_null
+        scores_diff_json[example.qas_id] = score_diff
+        # note(zhiliny): always predict best_non_null_entry
+        # and the evaluation script will search for the best threshold
+        all_predictions[example.qas_id] = best_non_null_entry.text
+
+        all_nbest_json[example.qas_id] = nbest_json
+
+    with open(output_prediction_file, "w") as writer:
+        writer.write(json.dumps(all_predictions, indent=4) + "\n")
+
+    with open(output_nbest_file, "w") as writer:
+        writer.write(json.dumps(all_nbest_json, indent=4) + "\n")
+
+    if version_2_with_negative:
+        with open(output_null_log_odds_file, "w") as writer:
+            writer.write(json.dumps(scores_diff_json, indent=4) + "\n")
+
+    return all_predictions
diff --git a/.venv/lib/python3.11/site-packages/transformers/data/processors/__init__.py b/.venv/lib/python3.11/site-packages/transformers/data/processors/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..a26ab5776d74715428b10c4d9cd943e53b253785
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/transformers/data/processors/__init__.py
@@ -0,0 +1,18 @@
+# Copyright 2020 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .glue import glue_convert_examples_to_features, glue_output_modes, glue_processors, glue_tasks_num_labels
+from .squad import SquadExample, SquadFeatures, SquadV1Processor, SquadV2Processor, squad_convert_examples_to_features
+from .utils import DataProcessor, InputExample, InputFeatures, SingleSentenceClassificationProcessor
+from .xnli import xnli_output_modes, xnli_processors, xnli_tasks_num_labels
diff --git a/.venv/lib/python3.11/site-packages/transformers/data/processors/__pycache__/__init__.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/data/processors/__pycache__/__init__.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..9fca4bd440198f640abdd9dba25095652571af0e
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/data/processors/__pycache__/__init__.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/transformers/data/processors/__pycache__/glue.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/data/processors/__pycache__/glue.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d12d5ea9c4b564454d4e02e40768d5b051548187
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/data/processors/__pycache__/glue.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/transformers/data/processors/__pycache__/squad.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/data/processors/__pycache__/squad.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c87ad2ed1b893dae18e27f264924a952dfe5d0a2
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/data/processors/__pycache__/squad.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/transformers/data/processors/__pycache__/utils.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/data/processors/__pycache__/utils.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c2d25b15d40952122c576aba045dfb39038e95e2
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/data/processors/__pycache__/utils.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/transformers/data/processors/__pycache__/xnli.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/data/processors/__pycache__/xnli.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..5884eb599a266072b5a06fc7babd4ef833f29d82
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/data/processors/__pycache__/xnli.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/transformers/data/processors/glue.py b/.venv/lib/python3.11/site-packages/transformers/data/processors/glue.py
new file mode 100644
index 0000000000000000000000000000000000000000..6e95a66684988ac21b628d3066bc7d0bf3964ea9
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/transformers/data/processors/glue.py
@@ -0,0 +1,643 @@
+# coding=utf-8
+# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
+# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""GLUE processors and helpers"""
+
+import os
+import warnings
+from dataclasses import asdict
+from enum import Enum
+from typing import List, Optional, Union
+
+from ...tokenization_utils import PreTrainedTokenizer
+from ...utils import is_tf_available, logging
+from .utils import DataProcessor, InputExample, InputFeatures
+
+
+if is_tf_available():
+    import tensorflow as tf
+
+logger = logging.get_logger(__name__)
+
+DEPRECATION_WARNING = (
+    "This {0} will be removed from the library soon, preprocessing should be handled with the 🤗 Datasets "
+    "library. You can have a look at this example script for pointers: "
+    "https://github.com/huggingface/transformers/blob/main/examples/pytorch/text-classification/run_glue.py"
+)
+
+
+def glue_convert_examples_to_features(
+    examples: Union[List[InputExample], "tf.data.Dataset"],
+    tokenizer: PreTrainedTokenizer,
+    max_length: Optional[int] = None,
+    task=None,
+    label_list=None,
+    output_mode=None,
+):
+    """
+    Loads a data file into a list of `InputFeatures`
+
+    Args:
+        examples: List of `InputExamples` or `tf.data.Dataset` containing the examples.
+        tokenizer: Instance of a tokenizer that will tokenize the examples
+        max_length: Maximum example length. Defaults to the tokenizer's max_len
+        task: GLUE task
+        label_list: List of labels. Can be obtained from the processor using the `processor.get_labels()` method
+        output_mode: String indicating the output mode. Either `regression` or `classification`
+
+    Returns:
+        If the `examples` input is a `tf.data.Dataset`, will return a `tf.data.Dataset` containing the task-specific
+        features. If the input is a list of `InputExamples`, will return a list of task-specific `InputFeatures` which
+        can be fed to the model.
+
+    """
+    warnings.warn(DEPRECATION_WARNING.format("function"), FutureWarning)
+    if is_tf_available() and isinstance(examples, tf.data.Dataset):
+        if task is None:
+            raise ValueError("When calling glue_convert_examples_to_features from TF, the task parameter is required.")
+        return _tf_glue_convert_examples_to_features(examples, tokenizer, max_length=max_length, task=task)
+    return _glue_convert_examples_to_features(
+        examples, tokenizer, max_length=max_length, task=task, label_list=label_list, output_mode=output_mode
+    )
+
+
+if is_tf_available():
+
+    def _tf_glue_convert_examples_to_features(
+        examples: tf.data.Dataset,
+        tokenizer: PreTrainedTokenizer,
+        task=str,
+        max_length: Optional[int] = None,
+    ) -> tf.data.Dataset:
+        """
+        Returns:
+            A `tf.data.Dataset` containing the task-specific features.
+
+        """
+        processor = glue_processors[task]()
+        examples = [processor.tfds_map(processor.get_example_from_tensor_dict(example)) for example in examples]
+        features = glue_convert_examples_to_features(examples, tokenizer, max_length=max_length, task=task)
+        label_type = tf.float32 if task == "sts-b" else tf.int64
+
+        def gen():
+            for ex in features:
+                d = {k: v for k, v in asdict(ex).items() if v is not None}
+                label = d.pop("label")
+                yield (d, label)
+
+        input_names = tokenizer.model_input_names
+
+        return tf.data.Dataset.from_generator(
+            gen,
+            ({k: tf.int32 for k in input_names}, label_type),
+            ({k: tf.TensorShape([None]) for k in input_names}, tf.TensorShape([])),
+        )
+
+
+def _glue_convert_examples_to_features(
+    examples: List[InputExample],
+    tokenizer: PreTrainedTokenizer,
+    max_length: Optional[int] = None,
+    task=None,
+    label_list=None,
+    output_mode=None,
+):
+    if max_length is None:
+        max_length = tokenizer.model_max_length
+
+    if task is not None:
+        processor = glue_processors[task]()
+        if label_list is None:
+            label_list = processor.get_labels()
+            logger.info(f"Using label list {label_list} for task {task}")
+        if output_mode is None:
+            output_mode = glue_output_modes[task]
+            logger.info(f"Using output mode {output_mode} for task {task}")
+
+    label_map = {label: i for i, label in enumerate(label_list)}
+
+    def label_from_example(example: InputExample) -> Union[int, float, None]:
+        if example.label is None:
+            return None
+        if output_mode == "classification":
+            return label_map[example.label]
+        elif output_mode == "regression":
+            return float(example.label)
+        raise KeyError(output_mode)
+
+    labels = [label_from_example(example) for example in examples]
+
+    batch_encoding = tokenizer(
+        [(example.text_a, example.text_b) for example in examples],
+        max_length=max_length,
+        padding="max_length",
+        truncation=True,
+    )
+
+    features = []
+    for i in range(len(examples)):
+        inputs = {k: batch_encoding[k][i] for k in batch_encoding}
+
+        feature = InputFeatures(**inputs, label=labels[i])
+        features.append(feature)
+
+    for i, example in enumerate(examples[:5]):
+        logger.info("*** Example ***")
+        logger.info(f"guid: {example.guid}")
+        logger.info(f"features: {features[i]}")
+
+    return features
+
+
+class OutputMode(Enum):
+    classification = "classification"
+    regression = "regression"
+
+
+class MrpcProcessor(DataProcessor):
+    """Processor for the MRPC data set (GLUE version)."""
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        warnings.warn(DEPRECATION_WARNING.format("processor"), FutureWarning)
+
+    def get_example_from_tensor_dict(self, tensor_dict):
+        """See base class."""
+        return InputExample(
+            tensor_dict["idx"].numpy(),
+            tensor_dict["sentence1"].numpy().decode("utf-8"),
+            tensor_dict["sentence2"].numpy().decode("utf-8"),
+            str(tensor_dict["label"].numpy()),
+        )
+
+    def get_train_examples(self, data_dir):
+        """See base class."""
+        logger.info(f"LOOKING AT {os.path.join(data_dir, 'train.tsv')}")
+        return self._create_examples(self._read_tsv(os.path.join(data_dir, "train.tsv")), "train")
+
+    def get_dev_examples(self, data_dir):
+        """See base class."""
+        return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev")
+
+    def get_test_examples(self, data_dir):
+        """See base class."""
+        return self._create_examples(self._read_tsv(os.path.join(data_dir, "test.tsv")), "test")
+
+    def get_labels(self):
+        """See base class."""
+        return ["0", "1"]
+
+    def _create_examples(self, lines, set_type):
+        """Creates examples for the training, dev and test sets."""
+        examples = []
+        for i, line in enumerate(lines):
+            if i == 0:
+                continue
+            guid = f"{set_type}-{i}"
+            text_a = line[3]
+            text_b = line[4]
+            label = None if set_type == "test" else line[0]
+            examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
+        return examples
+
+
+class MnliProcessor(DataProcessor):
+    """Processor for the MultiNLI data set (GLUE version)."""
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        warnings.warn(DEPRECATION_WARNING.format("processor"), FutureWarning)
+
+    def get_example_from_tensor_dict(self, tensor_dict):
+        """See base class."""
+        return InputExample(
+            tensor_dict["idx"].numpy(),
+            tensor_dict["premise"].numpy().decode("utf-8"),
+            tensor_dict["hypothesis"].numpy().decode("utf-8"),
+            str(tensor_dict["label"].numpy()),
+        )
+
+    def get_train_examples(self, data_dir):
+        """See base class."""
+        return self._create_examples(self._read_tsv(os.path.join(data_dir, "train.tsv")), "train")
+
+    def get_dev_examples(self, data_dir):
+        """See base class."""
+        return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev_matched.tsv")), "dev_matched")
+
+    def get_test_examples(self, data_dir):
+        """See base class."""
+        return self._create_examples(self._read_tsv(os.path.join(data_dir, "test_matched.tsv")), "test_matched")
+
+    def get_labels(self):
+        """See base class."""
+        return ["contradiction", "entailment", "neutral"]
+
+    def _create_examples(self, lines, set_type):
+        """Creates examples for the training, dev and test sets."""
+        examples = []
+        for i, line in enumerate(lines):
+            if i == 0:
+                continue
+            guid = f"{set_type}-{line[0]}"
+            text_a = line[8]
+            text_b = line[9]
+            label = None if set_type.startswith("test") else line[-1]
+            examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
+        return examples
+
+
+class MnliMismatchedProcessor(MnliProcessor):
+    """Processor for the MultiNLI Mismatched data set (GLUE version)."""
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        warnings.warn(DEPRECATION_WARNING.format("processor"), FutureWarning)
+
+    def get_dev_examples(self, data_dir):
+        """See base class."""
+        return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev_mismatched.tsv")), "dev_mismatched")
+
+    def get_test_examples(self, data_dir):
+        """See base class."""
+        return self._create_examples(self._read_tsv(os.path.join(data_dir, "test_mismatched.tsv")), "test_mismatched")
+
+
+class ColaProcessor(DataProcessor):
+    """Processor for the CoLA data set (GLUE version)."""
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        warnings.warn(DEPRECATION_WARNING.format("processor"), FutureWarning)
+
+    def get_example_from_tensor_dict(self, tensor_dict):
+        """See base class."""
+        return InputExample(
+            tensor_dict["idx"].numpy(),
+            tensor_dict["sentence"].numpy().decode("utf-8"),
+            None,
+            str(tensor_dict["label"].numpy()),
+        )
+
+    def get_train_examples(self, data_dir):
+        """See base class."""
+        return self._create_examples(self._read_tsv(os.path.join(data_dir, "train.tsv")), "train")
+
+    def get_dev_examples(self, data_dir):
+        """See base class."""
+        return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev")
+
+    def get_test_examples(self, data_dir):
+        """See base class."""
+        return self._create_examples(self._read_tsv(os.path.join(data_dir, "test.tsv")), "test")
+
+    def get_labels(self):
+        """See base class."""
+        return ["0", "1"]
+
+    def _create_examples(self, lines, set_type):
+        """Creates examples for the training, dev and test sets."""
+        test_mode = set_type == "test"
+        if test_mode:
+            lines = lines[1:]
+        text_index = 1 if test_mode else 3
+        examples = []
+        for i, line in enumerate(lines):
+            guid = f"{set_type}-{i}"
+            text_a = line[text_index]
+            label = None if test_mode else line[1]
+            examples.append(InputExample(guid=guid, text_a=text_a, text_b=None, label=label))
+        return examples
+
+
+class Sst2Processor(DataProcessor):
+    """Processor for the SST-2 data set (GLUE version)."""
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        warnings.warn(DEPRECATION_WARNING.format("processor"), FutureWarning)
+
+    def get_example_from_tensor_dict(self, tensor_dict):
+        """See base class."""
+        return InputExample(
+            tensor_dict["idx"].numpy(),
+            tensor_dict["sentence"].numpy().decode("utf-8"),
+            None,
+            str(tensor_dict["label"].numpy()),
+        )
+
+    def get_train_examples(self, data_dir):
+        """See base class."""
+        return self._create_examples(self._read_tsv(os.path.join(data_dir, "train.tsv")), "train")
+
+    def get_dev_examples(self, data_dir):
+        """See base class."""
+        return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev")
+
+    def get_test_examples(self, data_dir):
+        """See base class."""
+        return self._create_examples(self._read_tsv(os.path.join(data_dir, "test.tsv")), "test")
+
+    def get_labels(self):
+        """See base class."""
+        return ["0", "1"]
+
+    def _create_examples(self, lines, set_type):
+        """Creates examples for the training, dev and test sets."""
+        examples = []
+        text_index = 1 if set_type == "test" else 0
+        for i, line in enumerate(lines):
+            if i == 0:
+                continue
+            guid = f"{set_type}-{i}"
+            text_a = line[text_index]
+            label = None if set_type == "test" else line[1]
+            examples.append(InputExample(guid=guid, text_a=text_a, text_b=None, label=label))
+        return examples
+
+
+class StsbProcessor(DataProcessor):
+    """Processor for the STS-B data set (GLUE version)."""
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        warnings.warn(DEPRECATION_WARNING.format("processor"), FutureWarning)
+
+    def get_example_from_tensor_dict(self, tensor_dict):
+        """See base class."""
+        return InputExample(
+            tensor_dict["idx"].numpy(),
+            tensor_dict["sentence1"].numpy().decode("utf-8"),
+            tensor_dict["sentence2"].numpy().decode("utf-8"),
+            str(tensor_dict["label"].numpy()),
+        )
+
+    def get_train_examples(self, data_dir):
+        """See base class."""
+        return self._create_examples(self._read_tsv(os.path.join(data_dir, "train.tsv")), "train")
+
+    def get_dev_examples(self, data_dir):
+        """See base class."""
+        return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev")
+
+    def get_test_examples(self, data_dir):
+        """See base class."""
+        return self._create_examples(self._read_tsv(os.path.join(data_dir, "test.tsv")), "test")
+
+    def get_labels(self):
+        """See base class."""
+        return [None]
+
+    def _create_examples(self, lines, set_type):
+        """Creates examples for the training, dev and test sets."""
+        examples = []
+        for i, line in enumerate(lines):
+            if i == 0:
+                continue
+            guid = f"{set_type}-{line[0]}"
+            text_a = line[7]
+            text_b = line[8]
+            label = None if set_type == "test" else line[-1]
+            examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
+        return examples
+
+
+class QqpProcessor(DataProcessor):
+    """Processor for the QQP data set (GLUE version)."""
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        warnings.warn(DEPRECATION_WARNING.format("processor"), FutureWarning)
+
+    def get_example_from_tensor_dict(self, tensor_dict):
+        """See base class."""
+        return InputExample(
+            tensor_dict["idx"].numpy(),
+            tensor_dict["question1"].numpy().decode("utf-8"),
+            tensor_dict["question2"].numpy().decode("utf-8"),
+            str(tensor_dict["label"].numpy()),
+        )
+
+    def get_train_examples(self, data_dir):
+        """See base class."""
+        return self._create_examples(self._read_tsv(os.path.join(data_dir, "train.tsv")), "train")
+
+    def get_dev_examples(self, data_dir):
+        """See base class."""
+        return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev")
+
+    def get_test_examples(self, data_dir):
+        """See base class."""
+        return self._create_examples(self._read_tsv(os.path.join(data_dir, "test.tsv")), "test")
+
+    def get_labels(self):
+        """See base class."""
+        return ["0", "1"]
+
+    def _create_examples(self, lines, set_type):
+        """Creates examples for the training, dev and test sets."""
+        test_mode = set_type == "test"
+        q1_index = 1 if test_mode else 3
+        q2_index = 2 if test_mode else 4
+        examples = []
+        for i, line in enumerate(lines):
+            if i == 0:
+                continue
+            guid = f"{set_type}-{line[0]}"
+            try:
+                text_a = line[q1_index]
+                text_b = line[q2_index]
+                label = None if test_mode else line[5]
+            except IndexError:
+                continue
+            examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
+        return examples
+
+
+class QnliProcessor(DataProcessor):
+    """Processor for the QNLI data set (GLUE version)."""
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        warnings.warn(DEPRECATION_WARNING.format("processor"), FutureWarning)
+
+    def get_example_from_tensor_dict(self, tensor_dict):
+        """See base class."""
+        return InputExample(
+            tensor_dict["idx"].numpy(),
+            tensor_dict["question"].numpy().decode("utf-8"),
+            tensor_dict["sentence"].numpy().decode("utf-8"),
+            str(tensor_dict["label"].numpy()),
+        )
+
+    def get_train_examples(self, data_dir):
+        """See base class."""
+        return self._create_examples(self._read_tsv(os.path.join(data_dir, "train.tsv")), "train")
+
+    def get_dev_examples(self, data_dir):
+        """See base class."""
+        return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev")
+
+    def get_test_examples(self, data_dir):
+        """See base class."""
+        return self._create_examples(self._read_tsv(os.path.join(data_dir, "test.tsv")), "test")
+
+    def get_labels(self):
+        """See base class."""
+        return ["entailment", "not_entailment"]
+
+    def _create_examples(self, lines, set_type):
+        """Creates examples for the training, dev and test sets."""
+        examples = []
+        for i, line in enumerate(lines):
+            if i == 0:
+                continue
+            guid = f"{set_type}-{line[0]}"
+            text_a = line[1]
+            text_b = line[2]
+            label = None if set_type == "test" else line[-1]
+            examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
+        return examples
+
+
+class RteProcessor(DataProcessor):
+    """Processor for the RTE data set (GLUE version)."""
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        warnings.warn(DEPRECATION_WARNING.format("processor"), FutureWarning)
+
+    def get_example_from_tensor_dict(self, tensor_dict):
+        """See base class."""
+        return InputExample(
+            tensor_dict["idx"].numpy(),
+            tensor_dict["sentence1"].numpy().decode("utf-8"),
+            tensor_dict["sentence2"].numpy().decode("utf-8"),
+            str(tensor_dict["label"].numpy()),
+        )
+
+    def get_train_examples(self, data_dir):
+        """See base class."""
+        return self._create_examples(self._read_tsv(os.path.join(data_dir, "train.tsv")), "train")
+
+    def get_dev_examples(self, data_dir):
+        """See base class."""
+        return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev")
+
+    def get_test_examples(self, data_dir):
+        """See base class."""
+        return self._create_examples(self._read_tsv(os.path.join(data_dir, "test.tsv")), "test")
+
+    def get_labels(self):
+        """See base class."""
+        return ["entailment", "not_entailment"]
+
+    def _create_examples(self, lines, set_type):
+        """Creates examples for the training, dev and test sets."""
+        examples = []
+        for i, line in enumerate(lines):
+            if i == 0:
+                continue
+            guid = f"{set_type}-{line[0]}"
+            text_a = line[1]
+            text_b = line[2]
+            label = None if set_type == "test" else line[-1]
+            examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
+        return examples
+
+
+class WnliProcessor(DataProcessor):
+    """Processor for the WNLI data set (GLUE version)."""
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        warnings.warn(DEPRECATION_WARNING.format("processor"), FutureWarning)
+
+    def get_example_from_tensor_dict(self, tensor_dict):
+        """See base class."""
+        return InputExample(
+            tensor_dict["idx"].numpy(),
+            tensor_dict["sentence1"].numpy().decode("utf-8"),
+            tensor_dict["sentence2"].numpy().decode("utf-8"),
+            str(tensor_dict["label"].numpy()),
+        )
+
+    def get_train_examples(self, data_dir):
+        """See base class."""
+        return self._create_examples(self._read_tsv(os.path.join(data_dir, "train.tsv")), "train")
+
+    def get_dev_examples(self, data_dir):
+        """See base class."""
+        return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev")
+
+    def get_test_examples(self, data_dir):
+        """See base class."""
+        return self._create_examples(self._read_tsv(os.path.join(data_dir, "test.tsv")), "test")
+
+    def get_labels(self):
+        """See base class."""
+        return ["0", "1"]
+
+    def _create_examples(self, lines, set_type):
+        """Creates examples for the training, dev and test sets."""
+        examples = []
+        for i, line in enumerate(lines):
+            if i == 0:
+                continue
+            guid = f"{set_type}-{line[0]}"
+            text_a = line[1]
+            text_b = line[2]
+            label = None if set_type == "test" else line[-1]
+            examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
+        return examples
+
+
+glue_tasks_num_labels = {
+    "cola": 2,
+    "mnli": 3,
+    "mrpc": 2,
+    "sst-2": 2,
+    "sts-b": 1,
+    "qqp": 2,
+    "qnli": 2,
+    "rte": 2,
+    "wnli": 2,
+}
+
+glue_processors = {
+    "cola": ColaProcessor,
+    "mnli": MnliProcessor,
+    "mnli-mm": MnliMismatchedProcessor,
+    "mrpc": MrpcProcessor,
+    "sst-2": Sst2Processor,
+    "sts-b": StsbProcessor,
+    "qqp": QqpProcessor,
+    "qnli": QnliProcessor,
+    "rte": RteProcessor,
+    "wnli": WnliProcessor,
+}
+
+glue_output_modes = {
+    "cola": "classification",
+    "mnli": "classification",
+    "mnli-mm": "classification",
+    "mrpc": "classification",
+    "sst-2": "classification",
+    "sts-b": "regression",
+    "qqp": "classification",
+    "qnli": "classification",
+    "rte": "classification",
+    "wnli": "classification",
+}
diff --git a/.venv/lib/python3.11/site-packages/transformers/data/processors/squad.py b/.venv/lib/python3.11/site-packages/transformers/data/processors/squad.py
new file mode 100644
index 0000000000000000000000000000000000000000..4677af124e9b1042287670a7239ec7cd815801f6
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/transformers/data/processors/squad.py
@@ -0,0 +1,845 @@
+# Copyright 2020 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import json
+import os
+from functools import partial
+from multiprocessing import Pool, cpu_count
+
+import numpy as np
+from tqdm import tqdm
+
+from ...models.bert.tokenization_bert import whitespace_tokenize
+from ...tokenization_utils_base import BatchEncoding, PreTrainedTokenizerBase, TruncationStrategy
+from ...utils import is_tf_available, is_torch_available, logging
+from .utils import DataProcessor
+
+
+# Store the tokenizers which insert 2 separators tokens
+MULTI_SEP_TOKENS_TOKENIZERS_SET = {"roberta", "camembert", "bart", "mpnet"}
+
+
+if is_torch_available():
+    import torch
+    from torch.utils.data import TensorDataset
+
+if is_tf_available():
+    import tensorflow as tf
+
+logger = logging.get_logger(__name__)
+
+
+def _improve_answer_span(doc_tokens, input_start, input_end, tokenizer, orig_answer_text):
+    """Returns tokenized answer spans that better match the annotated answer."""
+    tok_answer_text = " ".join(tokenizer.tokenize(orig_answer_text))
+
+    for new_start in range(input_start, input_end + 1):
+        for new_end in range(input_end, new_start - 1, -1):
+            text_span = " ".join(doc_tokens[new_start : (new_end + 1)])
+            if text_span == tok_answer_text:
+                return (new_start, new_end)
+
+    return (input_start, input_end)
+
+
+def _check_is_max_context(doc_spans, cur_span_index, position):
+    """Check if this is the 'max context' doc span for the token."""
+    best_score = None
+    best_span_index = None
+    for span_index, doc_span in enumerate(doc_spans):
+        end = doc_span.start + doc_span.length - 1
+        if position < doc_span.start:
+            continue
+        if position > end:
+            continue
+        num_left_context = position - doc_span.start
+        num_right_context = end - position
+        score = min(num_left_context, num_right_context) + 0.01 * doc_span.length
+        if best_score is None or score > best_score:
+            best_score = score
+            best_span_index = span_index
+
+    return cur_span_index == best_span_index
+
+
+def _new_check_is_max_context(doc_spans, cur_span_index, position):
+    """Check if this is the 'max context' doc span for the token."""
+    # if len(doc_spans) == 1:
+    # return True
+    best_score = None
+    best_span_index = None
+    for span_index, doc_span in enumerate(doc_spans):
+        end = doc_span["start"] + doc_span["length"] - 1
+        if position < doc_span["start"]:
+            continue
+        if position > end:
+            continue
+        num_left_context = position - doc_span["start"]
+        num_right_context = end - position
+        score = min(num_left_context, num_right_context) + 0.01 * doc_span["length"]
+        if best_score is None or score > best_score:
+            best_score = score
+            best_span_index = span_index
+
+    return cur_span_index == best_span_index
+
+
+def _is_whitespace(c):
+    if c == " " or c == "\t" or c == "\r" or c == "\n" or ord(c) == 0x202F:
+        return True
+    return False
+
+
+def squad_convert_example_to_features(
+    example, max_seq_length, doc_stride, max_query_length, padding_strategy, is_training
+):
+    features = []
+    if is_training and not example.is_impossible:
+        # Get start and end position
+        start_position = example.start_position
+        end_position = example.end_position
+
+        # If the answer cannot be found in the text, then skip this example.
+        actual_text = " ".join(example.doc_tokens[start_position : (end_position + 1)])
+        cleaned_answer_text = " ".join(whitespace_tokenize(example.answer_text))
+        if actual_text.find(cleaned_answer_text) == -1:
+            logger.warning(f"Could not find answer: '{actual_text}' vs. '{cleaned_answer_text}'")
+            return []
+
+    tok_to_orig_index = []
+    orig_to_tok_index = []
+    all_doc_tokens = []
+    for i, token in enumerate(example.doc_tokens):
+        orig_to_tok_index.append(len(all_doc_tokens))
+        if tokenizer.__class__.__name__ in [
+            "RobertaTokenizer",
+            "LongformerTokenizer",
+            "BartTokenizer",
+            "RobertaTokenizerFast",
+            "LongformerTokenizerFast",
+            "BartTokenizerFast",
+        ]:
+            sub_tokens = tokenizer.tokenize(token, add_prefix_space=True)
+        else:
+            sub_tokens = tokenizer.tokenize(token)
+        for sub_token in sub_tokens:
+            tok_to_orig_index.append(i)
+            all_doc_tokens.append(sub_token)
+
+    if is_training and not example.is_impossible:
+        tok_start_position = orig_to_tok_index[example.start_position]
+        if example.end_position < len(example.doc_tokens) - 1:
+            tok_end_position = orig_to_tok_index[example.end_position + 1] - 1
+        else:
+            tok_end_position = len(all_doc_tokens) - 1
+
+        (tok_start_position, tok_end_position) = _improve_answer_span(
+            all_doc_tokens, tok_start_position, tok_end_position, tokenizer, example.answer_text
+        )
+
+    spans = []
+
+    truncated_query = tokenizer.encode(
+        example.question_text, add_special_tokens=False, truncation=True, max_length=max_query_length
+    )
+
+    # Tokenizers who insert 2 SEP tokens in-between <context> & <question> need to have special handling
+    # in the way they compute mask of added tokens.
+    tokenizer_type = type(tokenizer).__name__.replace("Tokenizer", "").lower()
+    sequence_added_tokens = (
+        tokenizer.model_max_length - tokenizer.max_len_single_sentence + 1
+        if tokenizer_type in MULTI_SEP_TOKENS_TOKENIZERS_SET
+        else tokenizer.model_max_length - tokenizer.max_len_single_sentence
+    )
+    sequence_pair_added_tokens = tokenizer.model_max_length - tokenizer.max_len_sentences_pair
+
+    span_doc_tokens = all_doc_tokens
+    while len(spans) * doc_stride < len(all_doc_tokens):
+        # Define the side we want to truncate / pad and the text/pair sorting
+        if tokenizer.padding_side == "right":
+            texts = truncated_query
+            pairs = span_doc_tokens
+            truncation = TruncationStrategy.ONLY_SECOND.value
+        else:
+            texts = span_doc_tokens
+            pairs = truncated_query
+            truncation = TruncationStrategy.ONLY_FIRST.value
+
+        encoded_dict = tokenizer.encode_plus(  # TODO(thom) update this logic
+            texts,
+            pairs,
+            truncation=truncation,
+            padding=padding_strategy,
+            max_length=max_seq_length,
+            return_overflowing_tokens=True,
+            stride=max_seq_length - doc_stride - len(truncated_query) - sequence_pair_added_tokens,
+            return_token_type_ids=True,
+        )
+
+        paragraph_len = min(
+            len(all_doc_tokens) - len(spans) * doc_stride,
+            max_seq_length - len(truncated_query) - sequence_pair_added_tokens,
+        )
+
+        if tokenizer.pad_token_id in encoded_dict["input_ids"]:
+            if tokenizer.padding_side == "right":
+                non_padded_ids = encoded_dict["input_ids"][: encoded_dict["input_ids"].index(tokenizer.pad_token_id)]
+            else:
+                last_padding_id_position = (
+                    len(encoded_dict["input_ids"]) - 1 - encoded_dict["input_ids"][::-1].index(tokenizer.pad_token_id)
+                )
+                non_padded_ids = encoded_dict["input_ids"][last_padding_id_position + 1 :]
+
+        else:
+            non_padded_ids = encoded_dict["input_ids"]
+
+        tokens = tokenizer.convert_ids_to_tokens(non_padded_ids)
+
+        token_to_orig_map = {}
+        for i in range(paragraph_len):
+            index = len(truncated_query) + sequence_added_tokens + i if tokenizer.padding_side == "right" else i
+            token_to_orig_map[index] = tok_to_orig_index[len(spans) * doc_stride + i]
+
+        encoded_dict["paragraph_len"] = paragraph_len
+        encoded_dict["tokens"] = tokens
+        encoded_dict["token_to_orig_map"] = token_to_orig_map
+        encoded_dict["truncated_query_with_special_tokens_length"] = len(truncated_query) + sequence_added_tokens
+        encoded_dict["token_is_max_context"] = {}
+        encoded_dict["start"] = len(spans) * doc_stride
+        encoded_dict["length"] = paragraph_len
+
+        spans.append(encoded_dict)
+
+        if "overflowing_tokens" not in encoded_dict or (
+            "overflowing_tokens" in encoded_dict and len(encoded_dict["overflowing_tokens"]) == 0
+        ):
+            break
+        span_doc_tokens = encoded_dict["overflowing_tokens"]
+
+    for doc_span_index in range(len(spans)):
+        for j in range(spans[doc_span_index]["paragraph_len"]):
+            is_max_context = _new_check_is_max_context(spans, doc_span_index, doc_span_index * doc_stride + j)
+            index = (
+                j
+                if tokenizer.padding_side == "left"
+                else spans[doc_span_index]["truncated_query_with_special_tokens_length"] + j
+            )
+            spans[doc_span_index]["token_is_max_context"][index] = is_max_context
+
+    for span in spans:
+        # Identify the position of the CLS token
+        cls_index = span["input_ids"].index(tokenizer.cls_token_id)
+
+        # p_mask: mask with 1 for token than cannot be in the answer (0 for token which can be in an answer)
+        # Original TF implementation also keep the classification token (set to 0)
+        p_mask = np.ones_like(span["token_type_ids"])
+        if tokenizer.padding_side == "right":
+            p_mask[len(truncated_query) + sequence_added_tokens :] = 0
+        else:
+            p_mask[-len(span["tokens"]) : -(len(truncated_query) + sequence_added_tokens)] = 0
+
+        pad_token_indices = np.where(np.atleast_1d(span["input_ids"] == tokenizer.pad_token_id))
+        special_token_indices = np.asarray(
+            tokenizer.get_special_tokens_mask(span["input_ids"], already_has_special_tokens=True)
+        ).nonzero()
+
+        p_mask[pad_token_indices] = 1
+        p_mask[special_token_indices] = 1
+
+        # Set the cls index to 0: the CLS index can be used for impossible answers
+        p_mask[cls_index] = 0
+
+        span_is_impossible = example.is_impossible
+        start_position = 0
+        end_position = 0
+        if is_training and not span_is_impossible:
+            # For training, if our document chunk does not contain an annotation
+            # we throw it out, since there is nothing to predict.
+            doc_start = span["start"]
+            doc_end = span["start"] + span["length"] - 1
+            out_of_span = False
+
+            if not (tok_start_position >= doc_start and tok_end_position <= doc_end):
+                out_of_span = True
+
+            if out_of_span:
+                start_position = cls_index
+                end_position = cls_index
+                span_is_impossible = True
+            else:
+                if tokenizer.padding_side == "left":
+                    doc_offset = 0
+                else:
+                    doc_offset = len(truncated_query) + sequence_added_tokens
+
+                start_position = tok_start_position - doc_start + doc_offset
+                end_position = tok_end_position - doc_start + doc_offset
+
+        features.append(
+            SquadFeatures(
+                span["input_ids"],
+                span["attention_mask"],
+                span["token_type_ids"],
+                cls_index,
+                p_mask.tolist(),
+                example_index=0,  # Can not set unique_id and example_index here. They will be set after multiple processing.
+                unique_id=0,
+                paragraph_len=span["paragraph_len"],
+                token_is_max_context=span["token_is_max_context"],
+                tokens=span["tokens"],
+                token_to_orig_map=span["token_to_orig_map"],
+                start_position=start_position,
+                end_position=end_position,
+                is_impossible=span_is_impossible,
+                qas_id=example.qas_id,
+            )
+        )
+    return features
+
+
+def squad_convert_example_to_features_init(tokenizer_for_convert: PreTrainedTokenizerBase):
+    global tokenizer
+    tokenizer = tokenizer_for_convert
+
+
+def squad_convert_examples_to_features(
+    examples,
+    tokenizer,
+    max_seq_length,
+    doc_stride,
+    max_query_length,
+    is_training,
+    padding_strategy="max_length",
+    return_dataset=False,
+    threads=1,
+    tqdm_enabled=True,
+):
+    """
+    Converts a list of examples into a list of features that can be directly given as input to a model. It is
+    model-dependant and takes advantage of many of the tokenizer's features to create the model's inputs.
+
+    Args:
+        examples: list of [`~data.processors.squad.SquadExample`]
+        tokenizer: an instance of a child of [`PreTrainedTokenizer`]
+        max_seq_length: The maximum sequence length of the inputs.
+        doc_stride: The stride used when the context is too large and is split across several features.
+        max_query_length: The maximum length of the query.
+        is_training: whether to create features for model evaluation or model training.
+        padding_strategy: Default to "max_length". Which padding strategy to use
+        return_dataset: Default False. Either 'pt' or 'tf'.
+            if 'pt': returns a torch.data.TensorDataset, if 'tf': returns a tf.data.Dataset
+        threads: multiple processing threads.
+
+
+    Returns:
+        list of [`~data.processors.squad.SquadFeatures`]
+
+    Example:
+
+    ```python
+    processor = SquadV2Processor()
+    examples = processor.get_dev_examples(data_dir)
+
+    features = squad_convert_examples_to_features(
+        examples=examples,
+        tokenizer=tokenizer,
+        max_seq_length=args.max_seq_length,
+        doc_stride=args.doc_stride,
+        max_query_length=args.max_query_length,
+        is_training=not evaluate,
+    )
+    ```"""
+    # Defining helper methods
+    features = []
+
+    threads = min(threads, cpu_count())
+    with Pool(threads, initializer=squad_convert_example_to_features_init, initargs=(tokenizer,)) as p:
+        annotate_ = partial(
+            squad_convert_example_to_features,
+            max_seq_length=max_seq_length,
+            doc_stride=doc_stride,
+            max_query_length=max_query_length,
+            padding_strategy=padding_strategy,
+            is_training=is_training,
+        )
+        features = list(
+            tqdm(
+                p.imap(annotate_, examples, chunksize=32),
+                total=len(examples),
+                desc="convert squad examples to features",
+                disable=not tqdm_enabled,
+            )
+        )
+
+    new_features = []
+    unique_id = 1000000000
+    example_index = 0
+    for example_features in tqdm(
+        features, total=len(features), desc="add example index and unique id", disable=not tqdm_enabled
+    ):
+        if not example_features:
+            continue
+        for example_feature in example_features:
+            example_feature.example_index = example_index
+            example_feature.unique_id = unique_id
+            new_features.append(example_feature)
+            unique_id += 1
+        example_index += 1
+    features = new_features
+    del new_features
+    if return_dataset == "pt":
+        if not is_torch_available():
+            raise RuntimeError("PyTorch must be installed to return a PyTorch dataset.")
+
+        # Convert to Tensors and build dataset
+        all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
+        all_attention_masks = torch.tensor([f.attention_mask for f in features], dtype=torch.long)
+        all_token_type_ids = torch.tensor([f.token_type_ids for f in features], dtype=torch.long)
+        all_cls_index = torch.tensor([f.cls_index for f in features], dtype=torch.long)
+        all_p_mask = torch.tensor([f.p_mask for f in features], dtype=torch.float)
+        all_is_impossible = torch.tensor([f.is_impossible for f in features], dtype=torch.float)
+
+        if not is_training:
+            all_feature_index = torch.arange(all_input_ids.size(0), dtype=torch.long)
+            dataset = TensorDataset(
+                all_input_ids, all_attention_masks, all_token_type_ids, all_feature_index, all_cls_index, all_p_mask
+            )
+        else:
+            all_start_positions = torch.tensor([f.start_position for f in features], dtype=torch.long)
+            all_end_positions = torch.tensor([f.end_position for f in features], dtype=torch.long)
+            dataset = TensorDataset(
+                all_input_ids,
+                all_attention_masks,
+                all_token_type_ids,
+                all_start_positions,
+                all_end_positions,
+                all_cls_index,
+                all_p_mask,
+                all_is_impossible,
+            )
+
+        return features, dataset
+    elif return_dataset == "tf":
+        if not is_tf_available():
+            raise RuntimeError("TensorFlow must be installed to return a TensorFlow dataset.")
+
+        def gen():
+            for i, ex in enumerate(features):
+                if ex.token_type_ids is None:
+                    yield (
+                        {
+                            "input_ids": ex.input_ids,
+                            "attention_mask": ex.attention_mask,
+                            "feature_index": i,
+                            "qas_id": ex.qas_id,
+                        },
+                        {
+                            "start_positions": ex.start_position,
+                            "end_positions": ex.end_position,
+                            "cls_index": ex.cls_index,
+                            "p_mask": ex.p_mask,
+                            "is_impossible": ex.is_impossible,
+                        },
+                    )
+                else:
+                    yield (
+                        {
+                            "input_ids": ex.input_ids,
+                            "attention_mask": ex.attention_mask,
+                            "token_type_ids": ex.token_type_ids,
+                            "feature_index": i,
+                            "qas_id": ex.qas_id,
+                        },
+                        {
+                            "start_positions": ex.start_position,
+                            "end_positions": ex.end_position,
+                            "cls_index": ex.cls_index,
+                            "p_mask": ex.p_mask,
+                            "is_impossible": ex.is_impossible,
+                        },
+                    )
+
+        # Why have we split the batch into a tuple? PyTorch just has a list of tensors.
+        if "token_type_ids" in tokenizer.model_input_names:
+            train_types = (
+                {
+                    "input_ids": tf.int32,
+                    "attention_mask": tf.int32,
+                    "token_type_ids": tf.int32,
+                    "feature_index": tf.int64,
+                    "qas_id": tf.string,
+                },
+                {
+                    "start_positions": tf.int64,
+                    "end_positions": tf.int64,
+                    "cls_index": tf.int64,
+                    "p_mask": tf.int32,
+                    "is_impossible": tf.int32,
+                },
+            )
+
+            train_shapes = (
+                {
+                    "input_ids": tf.TensorShape([None]),
+                    "attention_mask": tf.TensorShape([None]),
+                    "token_type_ids": tf.TensorShape([None]),
+                    "feature_index": tf.TensorShape([]),
+                    "qas_id": tf.TensorShape([]),
+                },
+                {
+                    "start_positions": tf.TensorShape([]),
+                    "end_positions": tf.TensorShape([]),
+                    "cls_index": tf.TensorShape([]),
+                    "p_mask": tf.TensorShape([None]),
+                    "is_impossible": tf.TensorShape([]),
+                },
+            )
+        else:
+            train_types = (
+                {"input_ids": tf.int32, "attention_mask": tf.int32, "feature_index": tf.int64, "qas_id": tf.string},
+                {
+                    "start_positions": tf.int64,
+                    "end_positions": tf.int64,
+                    "cls_index": tf.int64,
+                    "p_mask": tf.int32,
+                    "is_impossible": tf.int32,
+                },
+            )
+
+            train_shapes = (
+                {
+                    "input_ids": tf.TensorShape([None]),
+                    "attention_mask": tf.TensorShape([None]),
+                    "feature_index": tf.TensorShape([]),
+                    "qas_id": tf.TensorShape([]),
+                },
+                {
+                    "start_positions": tf.TensorShape([]),
+                    "end_positions": tf.TensorShape([]),
+                    "cls_index": tf.TensorShape([]),
+                    "p_mask": tf.TensorShape([None]),
+                    "is_impossible": tf.TensorShape([]),
+                },
+            )
+
+        return tf.data.Dataset.from_generator(gen, train_types, train_shapes)
+    else:
+        return features
+
+
+class SquadProcessor(DataProcessor):
+    """
+    Processor for the SQuAD data set. overridden by SquadV1Processor and SquadV2Processor, used by the version 1.1 and
+    version 2.0 of SQuAD, respectively.
+    """
+
+    train_file = None
+    dev_file = None
+
+    def _get_example_from_tensor_dict(self, tensor_dict, evaluate=False):
+        if not evaluate:
+            answer = tensor_dict["answers"]["text"][0].numpy().decode("utf-8")
+            answer_start = tensor_dict["answers"]["answer_start"][0].numpy()
+            answers = []
+        else:
+            answers = [
+                {"answer_start": start.numpy(), "text": text.numpy().decode("utf-8")}
+                for start, text in zip(tensor_dict["answers"]["answer_start"], tensor_dict["answers"]["text"])
+            ]
+
+            answer = None
+            answer_start = None
+
+        return SquadExample(
+            qas_id=tensor_dict["id"].numpy().decode("utf-8"),
+            question_text=tensor_dict["question"].numpy().decode("utf-8"),
+            context_text=tensor_dict["context"].numpy().decode("utf-8"),
+            answer_text=answer,
+            start_position_character=answer_start,
+            title=tensor_dict["title"].numpy().decode("utf-8"),
+            answers=answers,
+        )
+
+    def get_examples_from_dataset(self, dataset, evaluate=False):
+        """
+        Creates a list of [`~data.processors.squad.SquadExample`] using a TFDS dataset.
+
+        Args:
+            dataset: The tfds dataset loaded from *tensorflow_datasets.load("squad")*
+            evaluate: Boolean specifying if in evaluation mode or in training mode
+
+        Returns:
+            List of SquadExample
+
+        Examples:
+
+        ```python
+        >>> import tensorflow_datasets as tfds
+
+        >>> dataset = tfds.load("squad")
+
+        >>> training_examples = get_examples_from_dataset(dataset, evaluate=False)
+        >>> evaluation_examples = get_examples_from_dataset(dataset, evaluate=True)
+        ```"""
+
+        if evaluate:
+            dataset = dataset["validation"]
+        else:
+            dataset = dataset["train"]
+
+        examples = []
+        for tensor_dict in tqdm(dataset):
+            examples.append(self._get_example_from_tensor_dict(tensor_dict, evaluate=evaluate))
+
+        return examples
+
+    def get_train_examples(self, data_dir, filename=None):
+        """
+        Returns the training examples from the data directory.
+
+        Args:
+            data_dir: Directory containing the data files used for training and evaluating.
+            filename: None by default, specify this if the training file has a different name than the original one
+                which is `train-v1.1.json` and `train-v2.0.json` for squad versions 1.1 and 2.0 respectively.
+
+        """
+        if data_dir is None:
+            data_dir = ""
+
+        if self.train_file is None:
+            raise ValueError("SquadProcessor should be instantiated via SquadV1Processor or SquadV2Processor")
+
+        with open(
+            os.path.join(data_dir, self.train_file if filename is None else filename), "r", encoding="utf-8"
+        ) as reader:
+            input_data = json.load(reader)["data"]
+        return self._create_examples(input_data, "train")
+
+    def get_dev_examples(self, data_dir, filename=None):
+        """
+        Returns the evaluation example from the data directory.
+
+        Args:
+            data_dir: Directory containing the data files used for training and evaluating.
+            filename: None by default, specify this if the evaluation file has a different name than the original one
+                which is `dev-v1.1.json` and `dev-v2.0.json` for squad versions 1.1 and 2.0 respectively.
+        """
+        if data_dir is None:
+            data_dir = ""
+
+        if self.dev_file is None:
+            raise ValueError("SquadProcessor should be instantiated via SquadV1Processor or SquadV2Processor")
+
+        with open(
+            os.path.join(data_dir, self.dev_file if filename is None else filename), "r", encoding="utf-8"
+        ) as reader:
+            input_data = json.load(reader)["data"]
+        return self._create_examples(input_data, "dev")
+
+    def _create_examples(self, input_data, set_type):
+        is_training = set_type == "train"
+        examples = []
+        for entry in tqdm(input_data):
+            title = entry["title"]
+            for paragraph in entry["paragraphs"]:
+                context_text = paragraph["context"]
+                for qa in paragraph["qas"]:
+                    qas_id = qa["id"]
+                    question_text = qa["question"]
+                    start_position_character = None
+                    answer_text = None
+                    answers = []
+
+                    is_impossible = qa.get("is_impossible", False)
+                    if not is_impossible:
+                        if is_training:
+                            answer = qa["answers"][0]
+                            answer_text = answer["text"]
+                            start_position_character = answer["answer_start"]
+                        else:
+                            answers = qa["answers"]
+
+                    example = SquadExample(
+                        qas_id=qas_id,
+                        question_text=question_text,
+                        context_text=context_text,
+                        answer_text=answer_text,
+                        start_position_character=start_position_character,
+                        title=title,
+                        is_impossible=is_impossible,
+                        answers=answers,
+                    )
+                    examples.append(example)
+        return examples
+
+
+class SquadV1Processor(SquadProcessor):
+    train_file = "train-v1.1.json"
+    dev_file = "dev-v1.1.json"
+
+
+class SquadV2Processor(SquadProcessor):
+    train_file = "train-v2.0.json"
+    dev_file = "dev-v2.0.json"
+
+
+class SquadExample:
+    """
+    A single training/test example for the Squad dataset, as loaded from disk.
+
+    Args:
+        qas_id: The example's unique identifier
+        question_text: The question string
+        context_text: The context string
+        answer_text: The answer string
+        start_position_character: The character position of the start of the answer
+        title: The title of the example
+        answers: None by default, this is used during evaluation. Holds answers as well as their start positions.
+        is_impossible: False by default, set to True if the example has no possible answer.
+    """
+
+    def __init__(
+        self,
+        qas_id,
+        question_text,
+        context_text,
+        answer_text,
+        start_position_character,
+        title,
+        answers=[],
+        is_impossible=False,
+    ):
+        self.qas_id = qas_id
+        self.question_text = question_text
+        self.context_text = context_text
+        self.answer_text = answer_text
+        self.title = title
+        self.is_impossible = is_impossible
+        self.answers = answers
+
+        self.start_position, self.end_position = 0, 0
+
+        doc_tokens = []
+        char_to_word_offset = []
+        prev_is_whitespace = True
+
+        # Split on whitespace so that different tokens may be attributed to their original position.
+        for c in self.context_text:
+            if _is_whitespace(c):
+                prev_is_whitespace = True
+            else:
+                if prev_is_whitespace:
+                    doc_tokens.append(c)
+                else:
+                    doc_tokens[-1] += c
+                prev_is_whitespace = False
+            char_to_word_offset.append(len(doc_tokens) - 1)
+
+        self.doc_tokens = doc_tokens
+        self.char_to_word_offset = char_to_word_offset
+
+        # Start and end positions only has a value during evaluation.
+        if start_position_character is not None and not is_impossible:
+            self.start_position = char_to_word_offset[start_position_character]
+            self.end_position = char_to_word_offset[
+                min(start_position_character + len(answer_text) - 1, len(char_to_word_offset) - 1)
+            ]
+
+
+class SquadFeatures:
+    """
+    Single squad example features to be fed to a model. Those features are model-specific and can be crafted from
+    [`~data.processors.squad.SquadExample`] using the
+    :method:*~transformers.data.processors.squad.squad_convert_examples_to_features* method.
+
+    Args:
+        input_ids: Indices of input sequence tokens in the vocabulary.
+        attention_mask: Mask to avoid performing attention on padding token indices.
+        token_type_ids: Segment token indices to indicate first and second portions of the inputs.
+        cls_index: the index of the CLS token.
+        p_mask: Mask identifying tokens that can be answers vs. tokens that cannot.
+            Mask with 1 for tokens than cannot be in the answer and 0 for token that can be in an answer
+        example_index: the index of the example
+        unique_id: The unique Feature identifier
+        paragraph_len: The length of the context
+        token_is_max_context:
+            List of booleans identifying which tokens have their maximum context in this feature object. If a token
+            does not have their maximum context in this feature object, it means that another feature object has more
+            information related to that token and should be prioritized over this feature for that token.
+        tokens: list of tokens corresponding to the input ids
+        token_to_orig_map: mapping between the tokens and the original text, needed in order to identify the answer.
+        start_position: start of the answer token index
+        end_position: end of the answer token index
+        encoding: optionally store the BatchEncoding with the fast-tokenizer alignment methods.
+    """
+
+    def __init__(
+        self,
+        input_ids,
+        attention_mask,
+        token_type_ids,
+        cls_index,
+        p_mask,
+        example_index,
+        unique_id,
+        paragraph_len,
+        token_is_max_context,
+        tokens,
+        token_to_orig_map,
+        start_position,
+        end_position,
+        is_impossible,
+        qas_id: str = None,
+        encoding: BatchEncoding = None,
+    ):
+        self.input_ids = input_ids
+        self.attention_mask = attention_mask
+        self.token_type_ids = token_type_ids
+        self.cls_index = cls_index
+        self.p_mask = p_mask
+
+        self.example_index = example_index
+        self.unique_id = unique_id
+        self.paragraph_len = paragraph_len
+        self.token_is_max_context = token_is_max_context
+        self.tokens = tokens
+        self.token_to_orig_map = token_to_orig_map
+
+        self.start_position = start_position
+        self.end_position = end_position
+        self.is_impossible = is_impossible
+        self.qas_id = qas_id
+
+        self.encoding = encoding
+
+
+class SquadResult:
+    """
+    Constructs a SquadResult which can be used to evaluate a model's output on the SQuAD dataset.
+
+    Args:
+        unique_id: The unique identifier corresponding to that example.
+        start_logits: The logits corresponding to the start of the answer
+        end_logits: The logits corresponding to the end of the answer
+    """
+
+    def __init__(self, unique_id, start_logits, end_logits, start_top_index=None, end_top_index=None, cls_logits=None):
+        self.start_logits = start_logits
+        self.end_logits = end_logits
+        self.unique_id = unique_id
+
+        if start_top_index:
+            self.start_top_index = start_top_index
+            self.end_top_index = end_top_index
+            self.cls_logits = cls_logits
diff --git a/.venv/lib/python3.11/site-packages/transformers/data/processors/utils.py b/.venv/lib/python3.11/site-packages/transformers/data/processors/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..936f5a51e9fcf4c4189eb444e567d761e8fa0865
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/transformers/data/processors/utils.py
@@ -0,0 +1,349 @@
+# coding=utf-8
+# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
+# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import csv
+import dataclasses
+import json
+from dataclasses import dataclass
+from typing import List, Optional, Union
+
+from ...utils import is_tf_available, is_torch_available, logging
+
+
+logger = logging.get_logger(__name__)
+
+
+@dataclass
+class InputExample:
+    """
+    A single training/test example for simple sequence classification.
+
+    Args:
+        guid: Unique id for the example.
+        text_a: string. The untokenized text of the first sequence. For single
+            sequence tasks, only this sequence must be specified.
+        text_b: (Optional) string. The untokenized text of the second sequence.
+            Only must be specified for sequence pair tasks.
+        label: (Optional) string. The label of the example. This should be
+            specified for train and dev examples, but not for test examples.
+    """
+
+    guid: str
+    text_a: str
+    text_b: Optional[str] = None
+    label: Optional[str] = None
+
+    def to_json_string(self):
+        """Serializes this instance to a JSON string."""
+        return json.dumps(dataclasses.asdict(self), indent=2) + "\n"
+
+
+@dataclass(frozen=True)
+class InputFeatures:
+    """
+    A single set of features of data. Property names are the same names as the corresponding inputs to a model.
+
+    Args:
+        input_ids: Indices of input sequence tokens in the vocabulary.
+        attention_mask: Mask to avoid performing attention on padding token indices.
+            Mask values selected in `[0, 1]`: Usually `1` for tokens that are NOT MASKED, `0` for MASKED (padded)
+            tokens.
+        token_type_ids: (Optional) Segment token indices to indicate first and second
+            portions of the inputs. Only some models use them.
+        label: (Optional) Label corresponding to the input. Int for classification problems,
+            float for regression problems.
+    """
+
+    input_ids: List[int]
+    attention_mask: Optional[List[int]] = None
+    token_type_ids: Optional[List[int]] = None
+    label: Optional[Union[int, float]] = None
+
+    def to_json_string(self):
+        """Serializes this instance to a JSON string."""
+        return json.dumps(dataclasses.asdict(self)) + "\n"
+
+
+class DataProcessor:
+    """Base class for data converters for sequence classification data sets."""
+
+    def get_example_from_tensor_dict(self, tensor_dict):
+        """
+        Gets an example from a dict with tensorflow tensors.
+
+        Args:
+            tensor_dict: Keys and values should match the corresponding Glue
+                tensorflow_dataset examples.
+        """
+        raise NotImplementedError()
+
+    def get_train_examples(self, data_dir):
+        """Gets a collection of [`InputExample`] for the train set."""
+        raise NotImplementedError()
+
+    def get_dev_examples(self, data_dir):
+        """Gets a collection of [`InputExample`] for the dev set."""
+        raise NotImplementedError()
+
+    def get_test_examples(self, data_dir):
+        """Gets a collection of [`InputExample`] for the test set."""
+        raise NotImplementedError()
+
+    def get_labels(self):
+        """Gets the list of labels for this data set."""
+        raise NotImplementedError()
+
+    def tfds_map(self, example):
+        """
+        Some tensorflow_datasets datasets are not formatted the same way the GLUE datasets are. This method converts
+        examples to the correct format.
+        """
+        if len(self.get_labels()) > 1:
+            example.label = self.get_labels()[int(example.label)]
+        return example
+
+    @classmethod
+    def _read_tsv(cls, input_file, quotechar=None):
+        """Reads a tab separated value file."""
+        with open(input_file, "r", encoding="utf-8-sig") as f:
+            return list(csv.reader(f, delimiter="\t", quotechar=quotechar))
+
+
+class SingleSentenceClassificationProcessor(DataProcessor):
+    """Generic processor for a single sentence classification data set."""
+
+    def __init__(self, labels=None, examples=None, mode="classification", verbose=False):
+        self.labels = [] if labels is None else labels
+        self.examples = [] if examples is None else examples
+        self.mode = mode
+        self.verbose = verbose
+
+    def __len__(self):
+        return len(self.examples)
+
+    def __getitem__(self, idx):
+        if isinstance(idx, slice):
+            return SingleSentenceClassificationProcessor(labels=self.labels, examples=self.examples[idx])
+        return self.examples[idx]
+
+    @classmethod
+    def create_from_csv(
+        cls, file_name, split_name="", column_label=0, column_text=1, column_id=None, skip_first_row=False, **kwargs
+    ):
+        processor = cls(**kwargs)
+        processor.add_examples_from_csv(
+            file_name,
+            split_name=split_name,
+            column_label=column_label,
+            column_text=column_text,
+            column_id=column_id,
+            skip_first_row=skip_first_row,
+            overwrite_labels=True,
+            overwrite_examples=True,
+        )
+        return processor
+
+    @classmethod
+    def create_from_examples(cls, texts_or_text_and_labels, labels=None, **kwargs):
+        processor = cls(**kwargs)
+        processor.add_examples(texts_or_text_and_labels, labels=labels)
+        return processor
+
+    def add_examples_from_csv(
+        self,
+        file_name,
+        split_name="",
+        column_label=0,
+        column_text=1,
+        column_id=None,
+        skip_first_row=False,
+        overwrite_labels=False,
+        overwrite_examples=False,
+    ):
+        lines = self._read_tsv(file_name)
+        if skip_first_row:
+            lines = lines[1:]
+        texts = []
+        labels = []
+        ids = []
+        for i, line in enumerate(lines):
+            texts.append(line[column_text])
+            labels.append(line[column_label])
+            if column_id is not None:
+                ids.append(line[column_id])
+            else:
+                guid = f"{split_name}-{i}" if split_name else str(i)
+                ids.append(guid)
+
+        return self.add_examples(
+            texts, labels, ids, overwrite_labels=overwrite_labels, overwrite_examples=overwrite_examples
+        )
+
+    def add_examples(
+        self, texts_or_text_and_labels, labels=None, ids=None, overwrite_labels=False, overwrite_examples=False
+    ):
+        if labels is not None and len(texts_or_text_and_labels) != len(labels):
+            raise ValueError(
+                f"Text and labels have mismatched lengths {len(texts_or_text_and_labels)} and {len(labels)}"
+            )
+        if ids is not None and len(texts_or_text_and_labels) != len(ids):
+            raise ValueError(f"Text and ids have mismatched lengths {len(texts_or_text_and_labels)} and {len(ids)}")
+        if ids is None:
+            ids = [None] * len(texts_or_text_and_labels)
+        if labels is None:
+            labels = [None] * len(texts_or_text_and_labels)
+        examples = []
+        added_labels = set()
+        for text_or_text_and_label, label, guid in zip(texts_or_text_and_labels, labels, ids):
+            if isinstance(text_or_text_and_label, (tuple, list)) and label is None:
+                text, label = text_or_text_and_label
+            else:
+                text = text_or_text_and_label
+            added_labels.add(label)
+            examples.append(InputExample(guid=guid, text_a=text, text_b=None, label=label))
+
+        # Update examples
+        if overwrite_examples:
+            self.examples = examples
+        else:
+            self.examples.extend(examples)
+
+        # Update labels
+        if overwrite_labels:
+            self.labels = list(added_labels)
+        else:
+            self.labels = list(set(self.labels).union(added_labels))
+
+        return self.examples
+
+    def get_features(
+        self,
+        tokenizer,
+        max_length=None,
+        pad_on_left=False,
+        pad_token=0,
+        mask_padding_with_zero=True,
+        return_tensors=None,
+    ):
+        """
+        Convert examples in a list of `InputFeatures`
+
+        Args:
+            tokenizer: Instance of a tokenizer that will tokenize the examples
+            max_length: Maximum example length
+            pad_on_left: If set to `True`, the examples will be padded on the left rather than on the right (default)
+            pad_token: Padding token
+            mask_padding_with_zero: If set to `True`, the attention mask will be filled by `1` for actual values
+                and by `0` for padded values. If set to `False`, inverts it (`1` for padded values, `0` for actual
+                values)
+
+        Returns:
+            If the `examples` input is a `tf.data.Dataset`, will return a `tf.data.Dataset` containing the
+            task-specific features. If the input is a list of `InputExamples`, will return a list of task-specific
+            `InputFeatures` which can be fed to the model.
+
+        """
+        if max_length is None:
+            max_length = tokenizer.max_len
+
+        label_map = {label: i for i, label in enumerate(self.labels)}
+
+        all_input_ids = []
+        for ex_index, example in enumerate(self.examples):
+            if ex_index % 10000 == 0:
+                logger.info(f"Tokenizing example {ex_index}")
+
+            input_ids = tokenizer.encode(
+                example.text_a,
+                add_special_tokens=True,
+                max_length=min(max_length, tokenizer.max_len),
+            )
+            all_input_ids.append(input_ids)
+
+        batch_length = max(len(input_ids) for input_ids in all_input_ids)
+
+        features = []
+        for ex_index, (input_ids, example) in enumerate(zip(all_input_ids, self.examples)):
+            if ex_index % 10000 == 0:
+                logger.info(f"Writing example {ex_index}/{len(self.examples)}")
+            # The mask has 1 for real tokens and 0 for padding tokens. Only real
+            # tokens are attended to.
+            attention_mask = [1 if mask_padding_with_zero else 0] * len(input_ids)
+
+            # Zero-pad up to the sequence length.
+            padding_length = batch_length - len(input_ids)
+            if pad_on_left:
+                input_ids = ([pad_token] * padding_length) + input_ids
+                attention_mask = ([0 if mask_padding_with_zero else 1] * padding_length) + attention_mask
+            else:
+                input_ids = input_ids + ([pad_token] * padding_length)
+                attention_mask = attention_mask + ([0 if mask_padding_with_zero else 1] * padding_length)
+
+            if len(input_ids) != batch_length:
+                raise ValueError(f"Error with input length {len(input_ids)} vs {batch_length}")
+            if len(attention_mask) != batch_length:
+                raise ValueError(f"Error with input length {len(attention_mask)} vs {batch_length}")
+
+            if self.mode == "classification":
+                label = label_map[example.label]
+            elif self.mode == "regression":
+                label = float(example.label)
+            else:
+                raise ValueError(self.mode)
+
+            if ex_index < 5 and self.verbose:
+                logger.info("*** Example ***")
+                logger.info(f"guid: {example.guid}")
+                logger.info(f"input_ids: {' '.join([str(x) for x in input_ids])}")
+                logger.info(f"attention_mask: {' '.join([str(x) for x in attention_mask])}")
+                logger.info(f"label: {example.label} (id = {label})")
+
+            features.append(InputFeatures(input_ids=input_ids, attention_mask=attention_mask, label=label))
+
+        if return_tensors is None:
+            return features
+        elif return_tensors == "tf":
+            if not is_tf_available():
+                raise RuntimeError("return_tensors set to 'tf' but TensorFlow 2.0 can't be imported")
+            import tensorflow as tf
+
+            def gen():
+                for ex in features:
+                    yield ({"input_ids": ex.input_ids, "attention_mask": ex.attention_mask}, ex.label)
+
+            dataset = tf.data.Dataset.from_generator(
+                gen,
+                ({"input_ids": tf.int32, "attention_mask": tf.int32}, tf.int64),
+                ({"input_ids": tf.TensorShape([None]), "attention_mask": tf.TensorShape([None])}, tf.TensorShape([])),
+            )
+            return dataset
+        elif return_tensors == "pt":
+            if not is_torch_available():
+                raise RuntimeError("return_tensors set to 'pt' but PyTorch can't be imported")
+            import torch
+            from torch.utils.data import TensorDataset
+
+            all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
+            all_attention_mask = torch.tensor([f.attention_mask for f in features], dtype=torch.long)
+            if self.mode == "classification":
+                all_labels = torch.tensor([f.label for f in features], dtype=torch.long)
+            elif self.mode == "regression":
+                all_labels = torch.tensor([f.label for f in features], dtype=torch.float)
+
+            dataset = TensorDataset(all_input_ids, all_attention_mask, all_labels)
+            return dataset
+        else:
+            raise ValueError("return_tensors should be one of 'tf' or 'pt'")
diff --git a/.venv/lib/python3.11/site-packages/transformers/data/processors/xnli.py b/.venv/lib/python3.11/site-packages/transformers/data/processors/xnli.py
new file mode 100644
index 0000000000000000000000000000000000000000..4d8ec17a8345db5bf08325a334a4c6eb8af29157
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/transformers/data/processors/xnli.py
@@ -0,0 +1,96 @@
+# coding=utf-8
+# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
+# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""XNLI utils (dataset loading and evaluation)"""
+
+import os
+
+from ...utils import logging
+from .utils import DataProcessor, InputExample
+
+
+logger = logging.get_logger(__name__)
+
+
+class XnliProcessor(DataProcessor):
+    """
+    Processor for the XNLI dataset. Adapted from
+    https://github.com/google-research/bert/blob/f39e881b169b9d53bea03d2d341b31707a6c052b/run_classifier.py#L207
+    """
+
+    def __init__(self, language, train_language=None):
+        self.language = language
+        self.train_language = train_language
+
+    def get_train_examples(self, data_dir):
+        """See base class."""
+        lg = self.language if self.train_language is None else self.train_language
+        lines = self._read_tsv(os.path.join(data_dir, f"XNLI-MT-1.0/multinli/multinli.train.{lg}.tsv"))
+        examples = []
+        for i, line in enumerate(lines):
+            if i == 0:
+                continue
+            guid = f"train-{i}"
+            text_a = line[0]
+            text_b = line[1]
+            label = "contradiction" if line[2] == "contradictory" else line[2]
+            if not isinstance(text_a, str):
+                raise TypeError(f"Training input {text_a} is not a string")
+            if not isinstance(text_b, str):
+                raise TypeError(f"Training input {text_b} is not a string")
+            if not isinstance(label, str):
+                raise TypeError(f"Training label {label} is not a string")
+            examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
+        return examples
+
+    def get_test_examples(self, data_dir):
+        """See base class."""
+        lines = self._read_tsv(os.path.join(data_dir, "XNLI-1.0/xnli.test.tsv"))
+        examples = []
+        for i, line in enumerate(lines):
+            if i == 0:
+                continue
+            language = line[0]
+            if language != self.language:
+                continue
+            guid = f"test-{i}"
+            text_a = line[6]
+            text_b = line[7]
+            label = line[1]
+            if not isinstance(text_a, str):
+                raise TypeError(f"Training input {text_a} is not a string")
+            if not isinstance(text_b, str):
+                raise TypeError(f"Training input {text_b} is not a string")
+            if not isinstance(label, str):
+                raise TypeError(f"Training label {label} is not a string")
+            examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
+        return examples
+
+    def get_labels(self):
+        """See base class."""
+        return ["contradiction", "entailment", "neutral"]
+
+
+xnli_processors = {
+    "xnli": XnliProcessor,
+}
+
+xnli_output_modes = {
+    "xnli": "classification",
+}
+
+xnli_tasks_num_labels = {
+    "xnli": 3,
+}
diff --git a/.venv/lib/python3.11/site-packages/transformers/generation/__init__.py b/.venv/lib/python3.11/site-packages/transformers/generation/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..d3eb10c1e6b355875c6cddb86ca53022549a8333
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/transformers/generation/__init__.py
@@ -0,0 +1,352 @@
+# Copyright 2022 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import TYPE_CHECKING
+
+from ..utils import OptionalDependencyNotAvailable, _LazyModule, is_flax_available, is_tf_available, is_torch_available
+
+
+_import_structure = {
+    "configuration_utils": [
+        "BaseWatermarkingConfig",
+        "CompileConfig",
+        "GenerationConfig",
+        "GenerationMode",
+        "SynthIDTextWatermarkingConfig",
+        "WatermarkingConfig",
+    ],
+    "streamers": ["AsyncTextIteratorStreamer", "TextIteratorStreamer", "TextStreamer"],
+}
+
+try:
+    if not is_torch_available():
+        raise OptionalDependencyNotAvailable()
+except OptionalDependencyNotAvailable:
+    pass
+else:
+    _import_structure["beam_constraints"] = [
+        "Constraint",
+        "ConstraintListState",
+        "DisjunctiveConstraint",
+        "PhrasalConstraint",
+    ]
+    _import_structure["beam_search"] = [
+        "BeamHypotheses",
+        "BeamScorer",
+        "BeamSearchScorer",
+        "ConstrainedBeamSearchScorer",
+    ]
+    _import_structure["candidate_generator"] = [
+        "AssistedCandidateGenerator",
+        "CandidateGenerator",
+        "EarlyExitCandidateGenerator",
+        "PromptLookupCandidateGenerator",
+    ]
+    _import_structure["logits_process"] = [
+        "AlternatingCodebooksLogitsProcessor",
+        "ClassifierFreeGuidanceLogitsProcessor",
+        "EncoderNoRepeatNGramLogitsProcessor",
+        "EncoderRepetitionPenaltyLogitsProcessor",
+        "EpsilonLogitsWarper",
+        "EtaLogitsWarper",
+        "ExponentialDecayLengthPenalty",
+        "ForcedBOSTokenLogitsProcessor",
+        "ForcedEOSTokenLogitsProcessor",
+        "HammingDiversityLogitsProcessor",
+        "InfNanRemoveLogitsProcessor",
+        "LogitNormalization",
+        "LogitsProcessor",
+        "LogitsProcessorList",
+        "LogitsWarper",
+        "MinLengthLogitsProcessor",
+        "MinNewTokensLengthLogitsProcessor",
+        "MinPLogitsWarper",
+        "NoBadWordsLogitsProcessor",
+        "NoRepeatNGramLogitsProcessor",
+        "PrefixConstrainedLogitsProcessor",
+        "RepetitionPenaltyLogitsProcessor",
+        "SequenceBiasLogitsProcessor",
+        "SuppressTokensLogitsProcessor",
+        "SuppressTokensAtBeginLogitsProcessor",
+        "SynthIDTextWatermarkLogitsProcessor",
+        "TemperatureLogitsWarper",
+        "TopKLogitsWarper",
+        "TopPLogitsWarper",
+        "TypicalLogitsWarper",
+        "UnbatchedClassifierFreeGuidanceLogitsProcessor",
+        "WhisperTimeStampLogitsProcessor",
+        "WatermarkLogitsProcessor",
+    ]
+    _import_structure["stopping_criteria"] = [
+        "MaxNewTokensCriteria",
+        "MaxLengthCriteria",
+        "MaxTimeCriteria",
+        "ConfidenceCriteria",
+        "EosTokenCriteria",
+        "StoppingCriteria",
+        "StoppingCriteriaList",
+        "validate_stopping_criteria",
+        "StopStringCriteria",
+    ]
+    _import_structure["utils"] = [
+        "GenerationMixin",
+        "GreedySearchEncoderDecoderOutput",
+        "GreedySearchDecoderOnlyOutput",
+        "SampleEncoderDecoderOutput",
+        "SampleDecoderOnlyOutput",
+        "BeamSearchEncoderDecoderOutput",
+        "BeamSearchDecoderOnlyOutput",
+        "BeamSampleEncoderDecoderOutput",
+        "BeamSampleDecoderOnlyOutput",
+        "ContrastiveSearchEncoderDecoderOutput",
+        "ContrastiveSearchDecoderOnlyOutput",
+        "GenerateBeamDecoderOnlyOutput",
+        "GenerateBeamEncoderDecoderOutput",
+        "GenerateDecoderOnlyOutput",
+        "GenerateEncoderDecoderOutput",
+    ]
+    _import_structure["watermarking"] = [
+        "WatermarkDetector",
+        "WatermarkDetectorOutput",
+        "BayesianDetectorModel",
+        "BayesianDetectorConfig",
+        "SynthIDTextWatermarkDetector",
+    ]
+
+try:
+    if not is_tf_available():
+        raise OptionalDependencyNotAvailable()
+except OptionalDependencyNotAvailable:
+    pass
+else:
+    _import_structure["tf_logits_process"] = [
+        "TFForcedBOSTokenLogitsProcessor",
+        "TFForcedEOSTokenLogitsProcessor",
+        "TFForceTokensLogitsProcessor",
+        "TFLogitsProcessor",
+        "TFLogitsProcessorList",
+        "TFLogitsWarper",
+        "TFMinLengthLogitsProcessor",
+        "TFNoBadWordsLogitsProcessor",
+        "TFNoRepeatNGramLogitsProcessor",
+        "TFRepetitionPenaltyLogitsProcessor",
+        "TFSuppressTokensAtBeginLogitsProcessor",
+        "TFSuppressTokensLogitsProcessor",
+        "TFTemperatureLogitsWarper",
+        "TFTopKLogitsWarper",
+        "TFTopPLogitsWarper",
+    ]
+    _import_structure["tf_utils"] = [
+        "TFGenerationMixin",
+        "TFGreedySearchDecoderOnlyOutput",
+        "TFGreedySearchEncoderDecoderOutput",
+        "TFSampleEncoderDecoderOutput",
+        "TFSampleDecoderOnlyOutput",
+        "TFBeamSearchEncoderDecoderOutput",
+        "TFBeamSearchDecoderOnlyOutput",
+        "TFBeamSampleEncoderDecoderOutput",
+        "TFBeamSampleDecoderOnlyOutput",
+        "TFContrastiveSearchEncoderDecoderOutput",
+        "TFContrastiveSearchDecoderOnlyOutput",
+    ]
+
+try:
+    if not is_flax_available():
+        raise OptionalDependencyNotAvailable()
+except OptionalDependencyNotAvailable:
+    pass
+else:
+    _import_structure["flax_logits_process"] = [
+        "FlaxForcedBOSTokenLogitsProcessor",
+        "FlaxForcedEOSTokenLogitsProcessor",
+        "FlaxForceTokensLogitsProcessor",
+        "FlaxLogitsProcessor",
+        "FlaxLogitsProcessorList",
+        "FlaxLogitsWarper",
+        "FlaxMinLengthLogitsProcessor",
+        "FlaxSuppressTokensAtBeginLogitsProcessor",
+        "FlaxSuppressTokensLogitsProcessor",
+        "FlaxTemperatureLogitsWarper",
+        "FlaxTopKLogitsWarper",
+        "FlaxTopPLogitsWarper",
+        "FlaxWhisperTimeStampLogitsProcessor",
+        "FlaxNoRepeatNGramLogitsProcessor",
+    ]
+    _import_structure["flax_utils"] = [
+        "FlaxGenerationMixin",
+        "FlaxGreedySearchOutput",
+        "FlaxSampleOutput",
+        "FlaxBeamSearchOutput",
+    ]
+
+if TYPE_CHECKING:
+    from .configuration_utils import (
+        BaseWatermarkingConfig,
+        CompileConfig,
+        GenerationConfig,
+        GenerationMode,
+        SynthIDTextWatermarkingConfig,
+        WatermarkingConfig,
+    )
+    from .streamers import AsyncTextIteratorStreamer, TextIteratorStreamer, TextStreamer
+
+    try:
+        if not is_torch_available():
+            raise OptionalDependencyNotAvailable()
+    except OptionalDependencyNotAvailable:
+        pass
+    else:
+        from .beam_constraints import Constraint, ConstraintListState, DisjunctiveConstraint, PhrasalConstraint
+        from .beam_search import BeamHypotheses, BeamScorer, BeamSearchScorer, ConstrainedBeamSearchScorer
+        from .candidate_generator import (
+            AssistedCandidateGenerator,
+            CandidateGenerator,
+            EarlyExitCandidateGenerator,
+            PromptLookupCandidateGenerator,
+        )
+        from .logits_process import (
+            AlternatingCodebooksLogitsProcessor,
+            ClassifierFreeGuidanceLogitsProcessor,
+            EncoderNoRepeatNGramLogitsProcessor,
+            EncoderRepetitionPenaltyLogitsProcessor,
+            EpsilonLogitsWarper,
+            EtaLogitsWarper,
+            ExponentialDecayLengthPenalty,
+            ForcedBOSTokenLogitsProcessor,
+            ForcedEOSTokenLogitsProcessor,
+            HammingDiversityLogitsProcessor,
+            InfNanRemoveLogitsProcessor,
+            LogitNormalization,
+            LogitsProcessor,
+            LogitsProcessorList,
+            LogitsWarper,
+            MinLengthLogitsProcessor,
+            MinNewTokensLengthLogitsProcessor,
+            MinPLogitsWarper,
+            NoBadWordsLogitsProcessor,
+            NoRepeatNGramLogitsProcessor,
+            PrefixConstrainedLogitsProcessor,
+            RepetitionPenaltyLogitsProcessor,
+            SequenceBiasLogitsProcessor,
+            SuppressTokensAtBeginLogitsProcessor,
+            SuppressTokensLogitsProcessor,
+            SynthIDTextWatermarkLogitsProcessor,
+            TemperatureLogitsWarper,
+            TopKLogitsWarper,
+            TopPLogitsWarper,
+            TypicalLogitsWarper,
+            UnbatchedClassifierFreeGuidanceLogitsProcessor,
+            WatermarkLogitsProcessor,
+            WhisperTimeStampLogitsProcessor,
+        )
+        from .stopping_criteria import (
+            ConfidenceCriteria,
+            EosTokenCriteria,
+            MaxLengthCriteria,
+            MaxNewTokensCriteria,
+            MaxTimeCriteria,
+            StoppingCriteria,
+            StoppingCriteriaList,
+            StopStringCriteria,
+            validate_stopping_criteria,
+        )
+        from .utils import (
+            BeamSampleDecoderOnlyOutput,
+            BeamSampleEncoderDecoderOutput,
+            BeamSearchDecoderOnlyOutput,
+            BeamSearchEncoderDecoderOutput,
+            ContrastiveSearchDecoderOnlyOutput,
+            ContrastiveSearchEncoderDecoderOutput,
+            GenerateBeamDecoderOnlyOutput,
+            GenerateBeamEncoderDecoderOutput,
+            GenerateDecoderOnlyOutput,
+            GenerateEncoderDecoderOutput,
+            GenerationMixin,
+            GreedySearchDecoderOnlyOutput,
+            GreedySearchEncoderDecoderOutput,
+            SampleDecoderOnlyOutput,
+            SampleEncoderDecoderOutput,
+        )
+        from .watermarking import (
+            BayesianDetectorConfig,
+            BayesianDetectorModel,
+            SynthIDTextWatermarkDetector,
+            WatermarkDetector,
+            WatermarkDetectorOutput,
+        )
+
+    try:
+        if not is_tf_available():
+            raise OptionalDependencyNotAvailable()
+    except OptionalDependencyNotAvailable:
+        pass
+    else:
+        from .tf_logits_process import (
+            TFForcedBOSTokenLogitsProcessor,
+            TFForcedEOSTokenLogitsProcessor,
+            TFForceTokensLogitsProcessor,
+            TFLogitsProcessor,
+            TFLogitsProcessorList,
+            TFLogitsWarper,
+            TFMinLengthLogitsProcessor,
+            TFNoBadWordsLogitsProcessor,
+            TFNoRepeatNGramLogitsProcessor,
+            TFRepetitionPenaltyLogitsProcessor,
+            TFSuppressTokensAtBeginLogitsProcessor,
+            TFSuppressTokensLogitsProcessor,
+            TFTemperatureLogitsWarper,
+            TFTopKLogitsWarper,
+            TFTopPLogitsWarper,
+        )
+        from .tf_utils import (
+            TFBeamSampleDecoderOnlyOutput,
+            TFBeamSampleEncoderDecoderOutput,
+            TFBeamSearchDecoderOnlyOutput,
+            TFBeamSearchEncoderDecoderOutput,
+            TFContrastiveSearchDecoderOnlyOutput,
+            TFContrastiveSearchEncoderDecoderOutput,
+            TFGenerationMixin,
+            TFGreedySearchDecoderOnlyOutput,
+            TFGreedySearchEncoderDecoderOutput,
+            TFSampleDecoderOnlyOutput,
+            TFSampleEncoderDecoderOutput,
+        )
+
+    try:
+        if not is_flax_available():
+            raise OptionalDependencyNotAvailable()
+    except OptionalDependencyNotAvailable:
+        pass
+    else:
+        from .flax_logits_process import (
+            FlaxForcedBOSTokenLogitsProcessor,
+            FlaxForcedEOSTokenLogitsProcessor,
+            FlaxForceTokensLogitsProcessor,
+            FlaxLogitsProcessor,
+            FlaxLogitsProcessorList,
+            FlaxLogitsWarper,
+            FlaxMinLengthLogitsProcessor,
+            FlaxNoRepeatNGramLogitsProcessor,
+            FlaxSuppressTokensAtBeginLogitsProcessor,
+            FlaxSuppressTokensLogitsProcessor,
+            FlaxTemperatureLogitsWarper,
+            FlaxTopKLogitsWarper,
+            FlaxTopPLogitsWarper,
+            FlaxWhisperTimeStampLogitsProcessor,
+        )
+        from .flax_utils import FlaxBeamSearchOutput, FlaxGenerationMixin, FlaxGreedySearchOutput, FlaxSampleOutput
+else:
+    import sys
+
+    sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
diff --git a/.venv/lib/python3.11/site-packages/transformers/generation/__pycache__/candidate_generator.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/generation/__pycache__/candidate_generator.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..0e6550f95a2def445ae11a9d583a95571253a8e5
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/generation/__pycache__/candidate_generator.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/transformers/generation/__pycache__/configuration_utils.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/generation/__pycache__/configuration_utils.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..398f98c3257260678e64168b74039487c4b927de
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/generation/__pycache__/configuration_utils.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/transformers/generation/__pycache__/flax_logits_process.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/generation/__pycache__/flax_logits_process.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..7c52d96f4257afdf823c53190d92374893efbf8a
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/generation/__pycache__/flax_logits_process.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/transformers/generation/__pycache__/flax_utils.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/generation/__pycache__/flax_utils.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..207d5ad11b589b5fce6b10f75e471f3387d72a3b
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/generation/__pycache__/flax_utils.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/transformers/generation/__pycache__/stopping_criteria.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/generation/__pycache__/stopping_criteria.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..4f8e96f9afc94c1f1711b7d31d50a62855c4e133
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/generation/__pycache__/stopping_criteria.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/transformers/generation/__pycache__/streamers.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/generation/__pycache__/streamers.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..426f5509b4532f00409f9a60cf9dc05b355fb52a
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/generation/__pycache__/streamers.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/transformers/generation/__pycache__/tf_logits_process.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/generation/__pycache__/tf_logits_process.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..af8eff618049840da5fadd1a8f0ccc94cb3c3c46
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/generation/__pycache__/tf_logits_process.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/transformers/generation/__pycache__/watermarking.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/generation/__pycache__/watermarking.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a47bda7d8357069d019dfbc45b228847e9dc902a
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/generation/__pycache__/watermarking.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/transformers/generation/beam_constraints.py b/.venv/lib/python3.11/site-packages/transformers/generation/beam_constraints.py
new file mode 100644
index 0000000000000000000000000000000000000000..daf64209b796772ac5c8da0dc139d944c2e82430
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/transformers/generation/beam_constraints.py
@@ -0,0 +1,524 @@
+from abc import ABC, abstractmethod
+from typing import List, Optional
+
+
+class Constraint(ABC):
+    r"""Abstract base class for all constraints that can be applied during generation.
+    It must define how the constraint can be satisfied.
+
+    All classes that inherit Constraint must follow the requirement that
+
+    ```py
+    completed = False
+    while not completed:
+        _, completed = constraint.update(constraint.advance())
+    ```
+
+    will always terminate (halt).
+    """
+
+    def __init__(self):
+        # test for the above condition
+        self.test()
+
+    def test(self):
+        """
+        Tests whether this constraint has been properly defined.
+        """
+        counter = 0
+        completed = False
+        while not completed:
+            if counter == 1:
+                self.reset()
+            advance = self.advance()
+            if not self.does_advance(advance):
+                raise Exception(
+                    "Custom Constraint is not defined correctly. self.does_advance(self.advance()) must be true."
+                )
+
+            stepped, completed, reset = self.update(advance)
+            counter += 1
+
+            if counter > 10000:
+                raise Exception("update() does not fulfill the constraint.")
+
+        if self.remaining() != 0:
+            raise Exception("Custom Constraint is not defined correctly.")
+
+    @abstractmethod
+    def advance(self):
+        """
+        When called, returns the token(s) that would take this constraint one step closer to being fulfilled.
+
+        Return:
+            token_ids (Union[int, List[int], None]):
+                - A single token ID (int) that advances the constraint, or
+                - A list of token IDs that could advance the constraint
+                - None if the constraint is completed or cannot be advanced
+        """
+        raise NotImplementedError(
+            f"{self.__class__} is an abstract class. Only classes inheriting this class can be called."
+        )
+
+    @abstractmethod
+    def does_advance(self, token_id: int):
+        """
+        Reads in a token and returns whether it creates progress.
+        """
+        raise NotImplementedError(
+            f"{self.__class__} is an abstract class. Only classes inheriting this class can be called."
+        )
+
+    @abstractmethod
+    def update(self, token_id: int):
+        """
+        Reads in a token and returns booleans that indicate the progress made by it. This function will update the
+        state of this object unlikes `does_advance(self, token_id: int)`.
+
+        This isn't to test whether a certain token will advance the progress; it's to update its state as if it has
+        been generated. This becomes important if token_id != desired token (refer to else statement in
+        PhrasalConstraint)
+
+        Args:
+            token_id(`int`):
+                The id of a newly generated token in the beam search.
+        Return:
+            stepped(`bool`):
+                Whether this constraint has become one step closer to being fulfuilled.
+            completed(`bool`):
+                Whether this constraint has been completely fulfilled by this token being generated.
+            reset (`bool`):
+                Whether this constraint has reset its progress by this token being generated.
+        """
+        raise NotImplementedError(
+            f"{self.__class__} is an abstract class. Only classes inheriting this class can be called."
+        )
+
+    @abstractmethod
+    def reset(self):
+        """
+        Resets the state of this constraint to its initialization. We would call this in cases where the fulfillment of
+        a constraint is abrupted by an unwanted token.
+        """
+        raise NotImplementedError(
+            f"{self.__class__} is an abstract class. Only classes inheriting this class can be called."
+        )
+
+    @abstractmethod
+    def remaining(self):
+        """
+        Returns the number of remaining steps of `advance()` in order to complete this constraint.
+        """
+        raise NotImplementedError(
+            f"{self.__class__} is an abstract class. Only classes inheriting this class can be called."
+        )
+
+    @abstractmethod
+    def copy(self, stateful=False):
+        """
+        Creates a new instance of this constraint.
+
+        Args:
+            stateful(`bool`): Whether to not only copy the constraint for new instance, but also its state.
+
+        Return:
+            constraint(`Constraint`): The same constraint as the one being called from.
+        """
+        raise NotImplementedError(
+            f"{self.__class__} is an abstract class. Only classes inheriting this class can be called."
+        )
+
+
+class PhrasalConstraint(Constraint):
+    r"""
+    [`Constraint`] enforcing that an ordered sequence of tokens is included in the output.
+
+    Args:
+        token_ids (`List[int]`):
+            The id of the token that must be generated by the output.
+    """
+
+    def __init__(self, token_ids: List[int]):
+        super(Constraint, self).__init__()
+
+        if not isinstance(token_ids, list) or len(token_ids) == 0:
+            raise ValueError(f"`token_ids` has to be a non-empty list, but is {token_ids}.")
+        if any((not isinstance(token_id, int) or token_id < 0) for token_id in token_ids):
+            raise ValueError(f"Each list in `token_ids` has to be a list of positive integers, but is {token_ids}.")
+
+        self.token_ids = token_ids
+
+        self.seqlen = len(self.token_ids)
+        self.fulfilled_idx = -1  # the index of the currently fulfilled step
+        self.completed = False
+
+    def advance(self):
+        if self.completed:
+            return None
+        return self.token_ids[self.fulfilled_idx + 1]
+
+    def does_advance(self, token_id: int):
+        if not isinstance(token_id, int):
+            raise TypeError(f"`token_id` has to be an `int`, but is {token_id} of type {type(token_id)}")
+
+        if self.completed:
+            return False
+
+        return token_id == self.token_ids[self.fulfilled_idx + 1]
+
+    def update(self, token_id: int):
+        if not isinstance(token_id, int):
+            raise TypeError(f"`token_id` has to be an `int`, but is {token_id} of type {type(token_id)}")
+
+        stepped = False
+        completed = False
+        reset = False
+
+        if self.does_advance(token_id):
+            self.fulfilled_idx += 1
+            stepped = True
+            if self.fulfilled_idx == (self.seqlen - 1):
+                completed = True
+            self.completed = completed
+        else:
+            # failed to make progress.
+            reset = True
+            self.reset()
+        return stepped, completed, reset
+
+    def reset(self):
+        self.completed = False
+        self.fulfilled_idx = 0
+
+    def remaining(self):
+        return self.seqlen - (self.fulfilled_idx + 1)
+
+    def copy(self, stateful=False):
+        new_constraint = PhrasalConstraint(self.token_ids)
+
+        if stateful:
+            new_constraint.seq_len = self.seqlen
+            new_constraint.fulfilled_idx = self.fulfilled_idx
+            new_constraint.completed = self.completed
+
+        return new_constraint
+
+
+class DisjunctiveTrie:
+    def __init__(self, nested_token_ids: List[List[int]], no_subsets=True):
+        r"""
+        A helper class that builds a trie with the words represented in `nested_token_ids`.
+        """
+        self.max_height = max([len(one) for one in nested_token_ids])
+
+        root = {}
+        for token_ids in nested_token_ids:
+            level = root
+            for tidx, token_id in enumerate(token_ids):
+                if token_id not in level:
+                    level[token_id] = {}
+
+                level = level[token_id]
+
+        if no_subsets and self.has_subsets(root, nested_token_ids):
+            raise ValueError(
+                "Each list in `nested_token_ids` can't be a complete subset of another list, but is"
+                f" {nested_token_ids}."
+            )
+
+        self.trie = root
+
+    def next_tokens(self, current_seq):
+        """
+        The next possible tokens that will progress the trie, given the current sequence of tokens in `current_seq`.
+        """
+        start = self.trie
+
+        for current_token in current_seq:
+            start = start[current_token]
+
+        next_tokens = list(start.keys())
+
+        return next_tokens
+
+    def reached_leaf(self, current_seq):
+        next_tokens = self.next_tokens(current_seq)
+
+        return len(next_tokens) == 0
+
+    def count_leaves(self, root):
+        next_nodes = list(root.values())
+        if len(next_nodes) == 0:
+            return 1
+        else:
+            return sum([self.count_leaves(nn) for nn in next_nodes])
+
+    def has_subsets(self, trie, nested_token_ids):
+        """
+        Returns whether # of leaves == # of words. Otherwise some word is a subset of another.
+        """
+        leaf_count = self.count_leaves(trie)
+        return len(nested_token_ids) != leaf_count
+
+
+class DisjunctiveConstraint(Constraint):
+    r"""
+    A special [`Constraint`] that is fulfilled by fulfilling just one of several constraints.
+
+    Args:
+        nested_token_ids (`List[List[int]]`):
+            A list of words, where each word is a list of ids. This constraint is fulfilled by generating just one from
+            the list of words.
+    """
+
+    def __init__(self, nested_token_ids: List[List[int]]):
+        super(Constraint, self).__init__()
+
+        if not isinstance(nested_token_ids, list) or len(nested_token_ids) == 0:
+            raise ValueError(f"`nested_token_ids` has to be a non-empty list, but is {nested_token_ids}.")
+        if any(not isinstance(token_ids, list) for token_ids in nested_token_ids):
+            raise ValueError(f"`nested_token_ids` has to be a list of lists, but is {nested_token_ids}.")
+        if any(
+            any((not isinstance(token_id, int) or token_id < 0) for token_id in token_ids)
+            for token_ids in nested_token_ids
+        ):
+            raise ValueError(
+                f"Each list in `nested_token_ids` has to be a list of positive integers, but is {nested_token_ids}."
+            )
+
+        self.trie = DisjunctiveTrie(nested_token_ids)
+        self.token_ids = nested_token_ids
+
+        self.seqlen = self.trie.max_height
+        self.current_seq = []
+        self.completed = False
+
+    def advance(self):
+        token_list = self.trie.next_tokens(self.current_seq)
+
+        if len(token_list) == 0:
+            return None
+        else:
+            return token_list
+
+    def does_advance(self, token_id: int):
+        if not isinstance(token_id, int):
+            raise TypeError(f"`token_id` is supposed to be type `int`, but is {token_id} of type {type(token_id)}")
+
+        next_tokens = self.trie.next_tokens(self.current_seq)
+
+        return token_id in next_tokens
+
+    def update(self, token_id: int):
+        if not isinstance(token_id, int):
+            raise TypeError(f"`token_id` is supposed to be type `int`, but is {token_id} of type {type(token_id)}")
+
+        stepped = False
+        completed = False
+        reset = False
+
+        if self.does_advance(token_id):
+            self.current_seq.append(token_id)
+            stepped = True
+        else:
+            reset = True
+            self.reset()
+
+        completed = self.trie.reached_leaf(self.current_seq)
+        self.completed = completed
+
+        return stepped, completed, reset
+
+    def reset(self):
+        self.completed = False
+        self.current_seq = []
+
+    def remaining(self):
+        if self.completed:
+            # since this can be completed without reaching max height
+            return 0
+        else:
+            return self.seqlen - len(self.current_seq)
+
+    def copy(self, stateful=False):
+        new_constraint = DisjunctiveConstraint(self.token_ids)
+
+        if stateful:
+            new_constraint.seq_len = self.seqlen
+            new_constraint.current_seq = self.current_seq
+            new_constraint.completed = self.completed
+
+        return new_constraint
+
+
+class ConstraintListState:
+    r"""
+    A class for beam scorers to track its progress through a list of constraints.
+
+    Args:
+        constraints (`List[Constraint]`):
+            A list of [`Constraint`] objects that must be fulfilled by the beam scorer.
+    """
+
+    def __init__(self, constraints: List[Constraint]):
+        self.constraints = constraints
+
+        # max # of steps required to fulfill a given constraint
+        self.max_seqlen = max([c.seqlen for c in constraints])
+        self.n_constraints = len(constraints)
+        self.completed = False
+
+        self.init_state()
+
+    def init_state(self):
+        self.complete_constraints = []
+        self.inprogress_constraint = None
+        self.pending_constraints = [constraint.copy(stateful=False) for constraint in self.constraints]
+
+    def get_bank(self):
+        add = 0
+        if self.inprogress_constraint:
+            # extra points for having a constraint mid-fulfilled
+            add += self.max_seqlen - self.inprogress_constraint.remaining()
+
+        return (len(self.complete_constraints) * self.max_seqlen) + add
+
+    def advance(self):
+        """The list of tokens to generate such that we can make progress.
+        By "list" we don't mean the list of token that will fully fulfill a constraint.
+
+        Given constraints `c_i = {t_ij | j == # of tokens}`, If we're not in the middle of progressing through a
+        specific constraint `c_i`, we return:
+
+        `[t_k1 for k in indices of unfulfilled constraints]`
+
+        If we are in the middle of a constraint, then we return:
+            `[t_ij]`, where `i` is the index of the inprogress constraint, `j` is the next step for the constraint.
+
+        Though we don't care which constraint is fulfilled first, if we are in the progress of fulfilling a constraint,
+        that's the only one we'll return.
+        """
+        token_list = []
+        if self.inprogress_constraint is None:
+            for constraint in self.pending_constraints:  # "pending" == "unfulfilled yet"
+                advance = constraint.advance()
+                if isinstance(advance, int):
+                    token_list.append(advance)
+                elif isinstance(advance, list):
+                    token_list.extend(advance)
+        else:
+            advance = self.inprogress_constraint.advance()
+            if isinstance(advance, int):
+                token_list.append(advance)
+            elif isinstance(advance, list):
+                token_list.extend(advance)
+
+        if len(token_list) == 0:
+            return None
+        else:
+            return token_list
+
+    def reset(self, token_ids: Optional[List[int]]):
+        """
+        token_ids: the tokens generated thus far to reset the state of the progress through constraints.
+        """
+        self.init_state()
+
+        if token_ids is not None:
+            for token in token_ids:
+                # completes or steps **one** constraint
+                complete, stepped = self.add(token)
+
+                # the entire list of constraints are fulfilled
+                if self.completed:
+                    break
+
+    def add(self, token_id: int):
+        if not isinstance(token_id, int):
+            raise TypeError(f"`token_id` should be an `int`, but is `{token_id}`.")
+
+        complete, stepped = False, False
+
+        if self.completed:
+            complete = True
+            stepped = False
+            return complete, stepped
+
+        if self.inprogress_constraint is not None:
+            # In the middle of fulfilling a constraint. If the `token_id` *does* makes an incremental progress to current
+            # job, simply update the state
+
+            stepped, complete, reset = self.inprogress_constraint.update(token_id)
+            if reset:
+                # 1. If the next token breaks the progress, then we must restart.
+                #     e.g. constraint = "I love pies" and sequence so far is "I love" but `token_id` == "books".
+
+                #     But that doesn't mean we self.init_state(), since we only reset the state for this particular
+                #     constraint, not the full list of constraints.
+
+                self.pending_constraints.append(self.inprogress_constraint.copy(stateful=False))
+                self.inprogress_constraint = None
+
+            if complete:
+                # 2. If the next token completes the constraint, move it to completed list, set
+                #     inprogress to None. If there are no pending constraints either, then this full list of constraints
+                #     is complete.
+
+                self.complete_constraints.append(self.inprogress_constraint)
+                self.inprogress_constraint = None
+
+                if len(self.pending_constraints) == 0:
+                    # we're done!
+                    self.completed = True
+
+        else:
+            # Not in the middle of fulfilling a constraint. So does this `token_id` helps us step towards any of our list
+            # of constraints?
+
+            for cidx, pending_constraint in enumerate(self.pending_constraints):
+                if pending_constraint.does_advance(token_id):
+                    stepped, complete, reset = pending_constraint.update(token_id)
+
+                    if not stepped:
+                        raise Exception(
+                            "`constraint.update(token_id)` is not yielding incremental progress, "
+                            "even though `constraint.does_advance(token_id)` is true."
+                        )
+
+                    if complete:
+                        self.complete_constraints.append(pending_constraint)
+                        self.inprogress_constraint = None
+
+                    if not complete and stepped:
+                        self.inprogress_constraint = pending_constraint
+
+                    if complete or stepped:
+                        # If we made any progress at all, then it's at least not a "pending constraint".
+
+                        self.pending_constraints = (
+                            self.pending_constraints[:cidx] + self.pending_constraints[cidx + 1 :]
+                        )
+
+                        if len(self.pending_constraints) == 0 and self.inprogress_constraint is None:
+                            # If there's no longer any pending after this and no inprogress either, then we must be
+                            # complete.
+
+                            self.completed = True
+
+                        break  # prevent accidentally stepping through multiple constraints with just one token.
+
+        return complete, stepped
+
+    def copy(self, stateful=True):
+        new_state = ConstraintListState(self.constraints)  # we actually never though self.constraints objects
+        # throughout this process. So it's at initialization state.
+
+        if stateful:
+            new_state.complete_constraints = [
+                constraint.copy(stateful=True) for constraint in self.complete_constraints
+            ]
+            if self.inprogress_constraint is not None:
+                new_state.inprogress_constraint = self.inprogress_constraint.copy(stateful=True)
+            new_state.pending_constraints = [constraint.copy() for constraint in self.pending_constraints]
+
+        return new_state
diff --git a/.venv/lib/python3.11/site-packages/transformers/generation/beam_search.py b/.venv/lib/python3.11/site-packages/transformers/generation/beam_search.py
new file mode 100644
index 0000000000000000000000000000000000000000..7e4f13ad1e15c64718f153c740b827ee5f2b6a65
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/transformers/generation/beam_search.py
@@ -0,0 +1,1013 @@
+# coding=utf-8
+# Copyright 2020 The HuggingFace Inc. team
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from abc import ABC, abstractmethod
+from collections import UserDict
+from typing import Dict, List, Optional, Tuple, Union
+
+import numpy as np
+import torch
+
+from ..utils import add_start_docstrings
+from .beam_constraints import Constraint, ConstraintListState
+
+
+PROCESS_INPUTS_DOCSTRING = r"""
+    Args:
+        input_ids (`torch.LongTensor` of shape `(batch_size * num_beams, sequence_length)`):
+            Indices of input sequence tokens in the vocabulary.
+
+            Indices can be obtained using any class inheriting from [`PreTrainedTokenizer`]. See
+            [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for details.
+
+            [What are input IDs?](../glossary#input-ids)
+        next_scores (`torch.FloatTensor` of shape `(batch_size, 2 * num_beams)`):
+            Current scores of the top `2 * num_beams` non-finished beam hypotheses.
+        next_tokens (`torch.LongTensor` of shape `(batch_size, 2 * num_beams)`):
+            `input_ids` of the tokens corresponding to the top `2 * num_beams` non-finished beam hypotheses.
+        next_indices (`torch.LongTensor` of shape `(batch_size, 2 * num_beams)`):
+            Beam indices indicating to which beam hypothesis the `next_tokens` correspond.
+        pad_token_id (`int`, *optional*):
+            The id of the *padding* token.
+        eos_token_id (`Union[int, List[int]]`, *optional*):
+            The id of the *end-of-sequence* token. Optionally, use a list to set multiple *end-of-sequence* tokens.
+        beam_indices (`torch.LongTensor`, *optional*):
+            Beam indices indicating to which beam hypothesis each token correspond.
+        group_index (`int`, *optional*):
+            The index of the group of beams. Used with [`~PreTrainedModel.group_beam_search`].
+
+    Return:
+        `UserDict`: A dictionary composed of the fields as defined above:
+
+            - **next_beam_scores** (`torch.FloatTensor` of shape `(batch_size * num_beams)`) -- Updated scores of all
+              non-finished beams.
+            - **next_beam_tokens** (`torch.FloatTensor` of shape `(batch_size * num_beams)`) -- Next tokens to be added
+              to the non-finished beam_hypotheses.
+            - **next_beam_indices** (`torch.FloatTensor` of shape `(batch_size * num_beams)`) -- Beam indices
+              indicating to which beam the next tokens shall be added.
+
+"""
+
+FINALIZE_INPUTS_DOCSTRING = r"""
+    Args:
+        input_ids (`torch.LongTensor` of shape `(batch_size * num_beams, sequence_length)`):
+            Indices of input sequence tokens in the vocabulary.
+
+            Indices can be obtained using any class inheriting from [`PreTrainedTokenizer`]. See
+            [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for details.
+
+            [What are input IDs?](../glossary#input-ids)
+        final_beam_scores (`torch.FloatTensor` of shape `(batch_size * num_beams)`):
+            The final scores of all non-finished beams.
+        final_beam_tokens (`torch.FloatTensor` of shape `(batch_size * num_beams)`):
+            The last tokens to be added to the non-finished beam_hypotheses.
+        final_beam_indices (`torch.FloatTensor` of shape `(batch_size * num_beams)`):
+            The beam indices indicating to which beam the `final_beam_tokens` shall be added.
+        pad_token_id (`int`, *optional*):
+            The id of the *padding* token.
+        eos_token_id (`Union[int, List[int]]`, *optional*):
+            The id of the *end-of-sequence* token. Optionally, use a list to set multiple *end-of-sequence* tokens.
+
+    Return:
+        `torch.LongTensor` of shape `(batch_size * num_return_sequences, sequence_length)`: The generated sequences.
+        The second dimension (sequence_length) is either equal to `max_length` or shorter if all batches finished early
+        due to the `eos_token_id`.
+
+"""
+
+
+class BeamScorer(ABC):
+    """
+    Abstract base class for all beam scorers that are used for [`~PreTrainedModel.beam_search`] and
+    [`~PreTrainedModel.beam_sample`].
+    """
+
+    @abstractmethod
+    @add_start_docstrings(PROCESS_INPUTS_DOCSTRING)
+    def process(
+        self,
+        input_ids: torch.LongTensor,
+        next_scores: torch.FloatTensor,
+        next_tokens: torch.LongTensor,
+        next_indices: torch.LongTensor,
+        **kwargs,
+    ) -> Tuple[torch.Tensor]:
+        raise NotImplementedError("This is an abstract method.")
+
+    @abstractmethod
+    @add_start_docstrings(FINALIZE_INPUTS_DOCSTRING)
+    def finalize(
+        self,
+        input_ids: torch.LongTensor,
+        next_scores: torch.FloatTensor,
+        next_tokens: torch.LongTensor,
+        next_indices: torch.LongTensor,
+        max_length: int,
+        **kwargs,
+    ) -> torch.LongTensor:
+        raise NotImplementedError("This is an abstract method.")
+
+
+class BeamSearchScorer(BeamScorer):
+    r"""
+    [`BeamScorer`] implementing standard beam search decoding.
+
+    Adapted in part from [Facebook's XLM beam search
+    code](https://github.com/facebookresearch/XLM/blob/9e6f6814d17be4fe5b15f2e6c43eb2b2d76daeb4/src/model/transformer.py#L529).
+
+    Reference for the diverse beam search algorithm and implementation [Ashwin Kalyan's DBS
+    implementation](https://github.com/ashwinkalyan/dbs/blob/master/dbs/beam_utils.lua)
+
+    Args:
+        batch_size (`int`):
+            Batch Size of `input_ids` for which standard beam search decoding is run in parallel.
+        num_beams (`int`):
+            Number of beams for beam search.
+        device (`torch.device`):
+            Defines the device type (*e.g.*, `"cpu"` or `"cuda"`) on which this instance of `BeamSearchScorer` will be
+            allocated.
+        length_penalty (`float`, *optional*, defaults to 1.0):
+            Exponential penalty to the length that is used with beam-based generation. It is applied as an exponent to
+            the sequence length, which in turn is used to divide the score of the sequence. Since the score is the log
+            likelihood of the sequence (i.e. negative), `length_penalty` > 0.0 promotes longer sequences, while
+            `length_penalty` < 0.0 encourages shorter sequences.
+        do_early_stopping (`bool` or `str`, *optional*, defaults to `False`):
+            Controls the stopping condition for beam-based methods, like beam-search. It accepts the following values:
+            `True`, where the generation stops as soon as there are `num_beams` complete candidates; `False`, where an
+            heuristic is applied and the generation stops when is it very unlikely to find better candidates;
+            `"never"`, where the beam search procedure only stops when there cannot be better candidates (canonical
+            beam search algorithm).
+        num_beam_hyps_to_keep (`int`, *optional*, defaults to 1):
+            The number of beam hypotheses that shall be returned upon calling
+            [`~transformers.BeamSearchScorer.finalize`].
+        num_beam_groups (`int`, *optional*, defaults to 1):
+            Number of groups to divide `num_beams` into in order to ensure diversity among different groups of beams.
+            See [this paper](https://arxiv.org/pdf/1610.02424.pdf) for more details.
+        max_length (`int`, *optional*):
+            The maximum length of the sequence to be generated.
+    """
+
+    def __init__(
+        self,
+        batch_size: int,
+        num_beams: int,
+        device: torch.device,
+        length_penalty: Optional[float] = 1.0,
+        do_early_stopping: Optional[Union[bool, str]] = False,
+        num_beam_hyps_to_keep: Optional[int] = 1,
+        num_beam_groups: Optional[int] = 1,
+        max_length: Optional[int] = None,
+    ):
+        self.num_beams = num_beams
+        self.device = device
+        self.length_penalty = length_penalty
+        self.do_early_stopping = do_early_stopping
+        self.num_beam_hyps_to_keep = num_beam_hyps_to_keep
+        self.num_beam_groups = num_beam_groups
+        self.group_size = self.num_beams // self.num_beam_groups
+
+        self._is_init = False
+        # self._beam_hyps[i*self.num_beam_groups+j] is the beam_hyps of the j-th group in the i-th mini-batch.
+        # If group_beam_search is not used, the list consists of `batch_size` beam_hyps.
+        self._beam_hyps = [
+            BeamHypotheses(
+                num_beams=self.group_size,
+                length_penalty=self.length_penalty,
+                early_stopping=self.do_early_stopping,
+                max_length=max_length,
+            )
+            for _ in range(batch_size * self.num_beam_groups)
+        ]
+        # self._done[i*self.num_beam_groups+j] indicates whether the generation of the beam_hyps of the j-th group
+        # in the i-th mini-batch is complete.
+        self._done = torch.tensor(
+            [False for _ in range(batch_size * self.num_beam_groups)], dtype=torch.bool, device=self.device
+        )
+
+        if not isinstance(num_beams, int) or num_beams <= 1:
+            raise ValueError(
+                f"`num_beams` has to be an integer strictly greater than 1, but is {num_beams}. For `num_beams` == 1,"
+                " one should make use of `greedy_search` instead."
+            )
+
+        if not isinstance(num_beam_groups, int) or (num_beam_groups > num_beams) or (num_beams % num_beam_groups != 0):
+            raise ValueError(
+                "`num_beam_groups` has to be an integer smaller or equal than `num_beams` and `num_beams` has to be"
+                f" divisible by `num_beam_groups`, but is {num_beam_groups} with `num_beams` being {num_beams}."
+            )
+
+    @property
+    def is_done(self) -> bool:
+        return self._done.all()
+
+    def process(
+        self,
+        input_ids: torch.LongTensor,
+        next_scores: torch.FloatTensor,
+        next_tokens: torch.LongTensor,
+        next_indices: torch.LongTensor,
+        pad_token_id: Optional[Union[int, torch.Tensor]] = None,
+        eos_token_id: Optional[Union[int, List[int], torch.Tensor]] = None,
+        beam_indices: Optional[torch.LongTensor] = None,
+        group_index: Optional[int] = 0,
+        decoder_prompt_len: Optional[int] = 0,
+    ) -> Dict[str, torch.Tensor]:
+        # add up to the length which the next_scores is calculated on (including decoder prompt)
+        cur_len = input_ids.shape[-1] + 1
+        batch_size = len(self._beam_hyps) // self.num_beam_groups
+
+        if not (batch_size == (input_ids.shape[0] // self.group_size)):
+            if self.num_beam_groups > 1:
+                raise ValueError(
+                    f"A group beam size of {input_ids.shape[0]} is used as the input, but a group beam "
+                    f"size of {self.group_size} is expected by the beam scorer."
+                )
+            else:
+                raise ValueError(
+                    f"A beam size of {input_ids.shape[0]} is used as the input, but a beam size of "
+                    f"{self.group_size} is expected by the beam scorer."
+                )
+
+        device = input_ids.device
+        next_beam_scores = torch.zeros((batch_size, self.group_size), dtype=next_scores.dtype, device=device)
+        next_beam_tokens = torch.zeros((batch_size, self.group_size), dtype=next_tokens.dtype, device=device)
+        next_beam_indices = torch.zeros((batch_size, self.group_size), dtype=next_indices.dtype, device=device)
+
+        if eos_token_id is not None and not isinstance(eos_token_id, torch.Tensor):
+            if isinstance(eos_token_id, int):
+                eos_token_id = [eos_token_id]
+            eos_token_id = torch.tensor(eos_token_id)
+
+        for batch_idx in range(batch_size):
+            batch_group_idx = batch_idx * self.num_beam_groups + group_index
+            if self._done[batch_group_idx]:
+                if self.num_beams < len(self._beam_hyps[batch_group_idx]):
+                    raise ValueError(f"Batch can only be done if at least {self.num_beams} beams have been generated")
+                if eos_token_id is None or pad_token_id is None:
+                    raise ValueError("Generated beams >= num_beams -> eos_token_id and pad_token have to be defined")
+                # pad the batch
+                next_beam_scores[batch_idx, :] = 0
+                next_beam_tokens[batch_idx, :] = pad_token_id
+                next_beam_indices[batch_idx, :] = 0
+                continue
+
+            # next tokens for this sentence
+            beam_idx = 0
+            for beam_token_rank, (next_token, next_score, next_index) in enumerate(
+                zip(next_tokens[batch_idx], next_scores[batch_idx], next_indices[batch_idx])
+            ):
+                batch_beam_idx = batch_idx * self.group_size + next_index
+                # add to generated hypotheses if end of sentence
+                if (eos_token_id is not None) and (next_token.item() in eos_token_id):
+                    # if beam_token does not belong to top num_beams tokens, it should not be added
+                    is_beam_token_worse_than_top_num_beams = beam_token_rank >= self.group_size
+                    if is_beam_token_worse_than_top_num_beams:
+                        continue
+                    if beam_indices is not None:
+                        beam_index = beam_indices[batch_beam_idx]
+                        beam_index = beam_index + (batch_beam_idx,)
+                    else:
+                        beam_index = None
+
+                    self._beam_hyps[batch_group_idx].add(
+                        input_ids[batch_beam_idx].clone(),
+                        next_score.item(),
+                        beam_indices=beam_index,
+                        generated_len=cur_len - decoder_prompt_len,
+                    )
+                else:
+                    # add next predicted token since it is not eos_token
+                    next_beam_scores[batch_idx, beam_idx] = next_score
+                    next_beam_tokens[batch_idx, beam_idx] = next_token
+                    next_beam_indices[batch_idx, beam_idx] = batch_beam_idx
+                    beam_idx += 1
+
+                # once the beam for next step is full, don't add more tokens to it.
+                if beam_idx == self.group_size:
+                    break
+
+            if beam_idx < self.group_size:
+                raise ValueError(
+                    f"At most {self.group_size} tokens in {next_tokens[batch_idx]} can be equal to `eos_token_id:"
+                    f" {eos_token_id}`. Make sure {next_tokens[batch_idx]} are corrected."
+                )
+
+            # Check if we are done so that we can save a pad step if all(done)
+            self._done[batch_group_idx] = self._done[batch_group_idx] or self._beam_hyps[batch_group_idx].is_done(
+                next_scores[batch_idx].max().item(), cur_len, decoder_prompt_len
+            )
+
+        return UserDict(
+            {
+                "next_beam_scores": next_beam_scores.view(-1),
+                "next_beam_tokens": next_beam_tokens.view(-1),
+                "next_beam_indices": next_beam_indices.view(-1),
+            }
+        )
+
+    def finalize(
+        self,
+        input_ids: torch.LongTensor,
+        final_beam_scores: torch.FloatTensor,
+        final_beam_tokens: torch.LongTensor,
+        final_beam_indices: torch.LongTensor,
+        max_length: int,
+        pad_token_id: Optional[Union[int, torch.Tensor]] = None,
+        eos_token_id: Optional[Union[int, List[int], torch.Tensor]] = None,
+        beam_indices: Optional[torch.LongTensor] = None,
+        decoder_prompt_len: Optional[int] = 0,
+    ) -> Tuple[torch.LongTensor]:
+        batch_size = len(self._beam_hyps) // self.num_beam_groups
+
+        if eos_token_id is not None and not isinstance(eos_token_id, torch.Tensor):
+            if isinstance(eos_token_id, int):
+                eos_token_id = [eos_token_id]
+            eos_token_id = torch.tensor(eos_token_id)
+
+        # finalize all open beam hypotheses and add to generated hypotheses
+        for batch_group_idx, beam_hyp in enumerate(self._beam_hyps):
+            if self._done[batch_group_idx]:
+                continue
+
+            # all open beam hypotheses are added to the beam hypothesis
+            # beam hypothesis class automatically keeps the best beams
+            for index_per_group in range(self.group_size):
+                batch_beam_idx = batch_group_idx * self.group_size + index_per_group
+                final_score = final_beam_scores[batch_beam_idx].item()
+                final_tokens = input_ids[batch_beam_idx]
+                beam_index = beam_indices[batch_beam_idx] if beam_indices is not None else None
+                generated_len = final_tokens.shape[-1] - decoder_prompt_len
+                beam_hyp.add(final_tokens, final_score, beam_indices=beam_index, generated_len=generated_len)
+
+        # select the best hypotheses
+        sent_lengths = input_ids.new(batch_size * self.num_beam_hyps_to_keep)
+        best = []
+        best_indices = []
+        best_scores = torch.zeros(batch_size * self.num_beam_hyps_to_keep, device=self.device, dtype=torch.float32)
+
+        # retrieve best hypotheses
+        for i in range(batch_size):
+            beam_hyps_in_batch = self._beam_hyps[i * self.num_beam_groups : (i + 1) * self.num_beam_groups]
+            candidate_beams = [beam for beam_hyp in beam_hyps_in_batch for beam in beam_hyp.beams]
+            sorted_hyps = sorted(candidate_beams, key=lambda x: x[0])
+            for j in range(self.num_beam_hyps_to_keep):
+                best_hyp_tuple = sorted_hyps.pop()
+                best_score = best_hyp_tuple[0]
+                best_hyp = best_hyp_tuple[1]
+                best_index = best_hyp_tuple[2]
+                sent_lengths[self.num_beam_hyps_to_keep * i + j] = len(best_hyp)
+
+                # append hyp to lists
+                best.append(best_hyp)
+
+                # append indices to list
+                best_indices.append(best_index)
+
+                best_scores[i * self.num_beam_hyps_to_keep + j] = best_score
+
+        # prepare for adding eos
+        sent_lengths_max = sent_lengths.max().item() + 1
+        sent_max_len = min(sent_lengths_max, max_length) if max_length is not None else sent_lengths_max
+        decoded: torch.LongTensor = input_ids.new(batch_size * self.num_beam_hyps_to_keep, sent_max_len)
+
+        if len(best_indices) > 0 and best_indices[0] is not None:
+            indices: torch.LongTensor = input_ids.new(batch_size * self.num_beam_hyps_to_keep, sent_max_len)
+        else:
+            indices = None
+
+        # shorter batches are padded if needed
+        if sent_lengths.min().item() != sent_lengths.max().item():
+            if pad_token_id is None:
+                raise ValueError("`pad_token_id` has to be defined")
+            decoded.fill_(pad_token_id)
+
+        if indices is not None:
+            indices.fill_(-1)
+
+        # fill with hypotheses and eos_token_id if the latter fits in
+        for i, (hypo, best_idx) in enumerate(zip(best, best_indices)):
+            decoded[i, : sent_lengths[i]] = hypo
+
+            if indices is not None:
+                indices[i, : len(best_idx)] = torch.tensor(best_idx)
+
+            if sent_lengths[i] < sent_max_len:
+                # inserting only the first eos_token_id
+                decoded[i, sent_lengths[i]] = eos_token_id[0]
+
+        return UserDict(
+            {
+                "sequences": decoded,
+                "sequence_scores": best_scores,
+                "beam_indices": indices,
+            }
+        )
+
+
+class ConstrainedBeamSearchScorer(BeamScorer):
+    r"""
+    [`BeamScorer`] implementing constrained beam search decoding.
+
+
+    Args:
+        batch_size (`int`):
+            Batch Size of `input_ids` for which standard beam search decoding is run in parallel.
+        num_beams (`int`):
+            Number of beams for beam search.
+        constraints (`List[Constraint]`):
+            A list of positive constraints represented as `Constraint` objects that must be fulfilled in the generation
+            output. For more information, the documentation of [`Constraint`] should be read.
+        device (`torch.device`):
+            Defines the device type (*e.g.*, `"cpu"` or `"cuda"`) on which this instance of `BeamSearchScorer` will be
+            allocated.
+        length_penalty (`float`, *optional*, defaults to 1.0):
+            Exponential penalty to the length that is used with beam-based generation. It is applied as an exponent to
+            the sequence length, which in turn is used to divide the score of the sequence. Since the score is the log
+            likelihood of the sequence (i.e. negative), `length_penalty` > 0.0 promotes longer sequences, while
+            `length_penalty` < 0.0 encourages shorter sequences.
+        do_early_stopping (`bool` or `str`, *optional*, defaults to `False`):
+            Controls the stopping condition for beam-based methods, like beam-search. It accepts the following values:
+            `True`, where the generation stops as soon as there are `num_beams` complete candidates; `False`, where an
+            heuristic is applied and the generation stops when is it very unlikely to find better candidates;
+            `"never"`, where the beam search procedure only stops when there cannot be better candidates (canonical
+            beam search algorithm).
+        num_beam_hyps_to_keep (`int`, *optional*, defaults to 1):
+            The number of beam hypotheses that shall be returned upon calling
+            [`~transformers.BeamSearchScorer.finalize`].
+        num_beam_groups (`int`, *optional*, defaults to 1):
+            Number of groups to divide `num_beams` into in order to ensure diversity among different groups of beams.
+            See [this paper](https://arxiv.org/pdf/1610.02424.pdf) for more details.
+        max_length (`int`, *optional*):
+            The maximum length of the sequence to be generated.
+    """
+
+    def __init__(
+        self,
+        batch_size: int,
+        num_beams: int,
+        constraints: List[Constraint],
+        device: torch.device,
+        length_penalty: Optional[float] = 1.0,
+        do_early_stopping: Optional[Union[bool, str]] = False,
+        num_beam_hyps_to_keep: Optional[int] = 1,
+        num_beam_groups: Optional[int] = 1,
+        max_length: Optional[int] = None,
+    ):
+        self.num_beams = num_beams
+        self.device = device
+        self.length_penalty = length_penalty
+        self.do_early_stopping = do_early_stopping
+        self.num_beam_hyps_to_keep = num_beam_hyps_to_keep
+        self.num_beam_groups = num_beam_groups
+        self.group_size = self.num_beams // self.num_beam_groups
+        self.constraints = constraints
+
+        self._is_init = False
+        self._beam_hyps = [
+            BeamHypotheses(
+                num_beams=self.num_beams,
+                length_penalty=self.length_penalty,
+                early_stopping=self.do_early_stopping,
+                max_length=max_length,
+            )
+            for _ in range(batch_size)
+        ]
+        self._done = torch.tensor([False for _ in range(batch_size)], dtype=torch.bool, device=self.device)
+
+        if not isinstance(num_beams, int) or num_beams <= 1:
+            raise ValueError(
+                f"`num_beams` has to be an integer strictly greater than 1, but is {num_beams}. For `num_beams` == 1,"
+                " one should make use of `greedy_search` instead."
+            )
+
+        if not isinstance(num_beam_groups, int) or (num_beam_groups > num_beams) or (num_beams % num_beam_groups != 0):
+            raise ValueError(
+                "`num_beam_groups` has to be an integer smaller or equal than `num_beams` and `num_beams` has to be"
+                f" divisible by `num_beam_groups`, but is {num_beam_groups} with `num_beams` being {num_beams}."
+            )
+
+    @property
+    def is_done(self) -> bool:
+        return self._done.all()
+
+    def make_constraint_states(self, n):
+        return [ConstraintListState([constraint.copy() for constraint in self.constraints]) for _ in range(n)]
+
+    def check_completes_constraints(self, sequence):
+        new_state = self.make_constraint_states(1)[0]
+        new_state.reset(sequence)
+        return new_state.completed
+
+    def process(
+        self,
+        input_ids: torch.LongTensor,
+        next_scores: torch.FloatTensor,
+        next_tokens: torch.LongTensor,
+        next_indices: torch.LongTensor,
+        scores_for_all_vocab: torch.FloatTensor,
+        pad_token_id: Optional[Union[int, torch.Tensor]] = None,
+        eos_token_id: Optional[Union[int, List[int], torch.Tensor]] = None,
+        beam_indices: Optional[torch.LongTensor] = None,
+        decoder_prompt_len: Optional[int] = 0,
+    ) -> Tuple[torch.Tensor]:
+        r"""
+        Args:
+            input_ids (`torch.LongTensor` of shape `(batch_size * num_beams, sequence_length)`):
+                Indices of input sequence tokens in the vocabulary.
+
+                Indices can be obtained using any class inheriting from [`PreTrainedTokenizer`]. See
+                [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for details.
+
+                [What are input IDs?](../glossary#input-ids)
+            next_scores (`torch.FloatTensor` of shape `(batch_size, 2 * num_beams)`):
+                Current scores of the top `2 * num_beams` non-finished beam hypotheses.
+            next_tokens (`torch.LongTensor` of shape `(batch_size, 2 * num_beams)`):
+                `input_ids` of the tokens corresponding to the top `2 * num_beams` non-finished beam hypotheses.
+            next_indices (`torch.LongTensor` of shape `(batch_size, 2 * num_beams)`):
+                Beam indices indicating to which beam hypothesis the `next_tokens` correspond.
+            scores_for_all_vocab (`torch.FloatTensor` of shape `(batch_size * num_beams, sequence_length)`):
+                The scores of all tokens in the vocabulary for each of the beam hypotheses.
+            pad_token_id (`int`, *optional*):
+                The id of the *padding* token.
+            eos_token_id (`Union[int, List[int]]`, *optional*):
+                The id of the *end-of-sequence* token. Optionally, use a list to set multiple *end-of-sequence* tokens.
+            beam_indices (`torch.LongTensor`, *optional*):
+                Beam indices indicating to which beam hypothesis each token correspond.
+            decoder_prompt_len (`int`, *optional*):
+                The length of prompt that is included in the input to decoder.
+        Return:
+            `UserDict`: A dictionary composed of the fields as defined above:
+
+                - **next_beam_scores** (`torch.FloatTensor` of shape `(batch_size * num_beams)`) -- Updated scores of
+                  all
+                non-finished beams.
+
+                - **next_beam_tokens** (`torch.FloatTensor` of shape `(batch_size * num_beams)`) -- Next tokens to be
+                  added
+                to the non-finished beam_hypotheses.
+                - **next_beam_indices** (`torch.FloatTensor` of shape `(batch_size * num_beams)`) -- Beam indices
+                indicating to which beam the next tokens shall be added.
+        """
+
+        # add up to the length which the next_scores is calculated on (including decoder prompt)
+        cur_len = input_ids.shape[-1] + 1
+        batch_size = len(self._beam_hyps)
+        if not (batch_size == (input_ids.shape[0] // self.group_size)):
+            if self.num_beam_groups > 1:
+                raise ValueError(
+                    f"A group beam size of {input_ids.shape[0]} is used as the input, but a group beam "
+                    f"size of {self.group_size} is expected by the beam scorer."
+                )
+            else:
+                raise ValueError(
+                    f"A beam size of {input_ids.shape[0]} is used as the input, but a beam size of "
+                    f"{self.group_size} is expected by the beam scorer."
+                )
+
+        device = input_ids.device
+
+        next_beam_scores = torch.zeros((batch_size, self.group_size), dtype=next_scores.dtype, device=device)
+        next_beam_tokens = torch.zeros((batch_size, self.group_size), dtype=next_tokens.dtype, device=device)
+        next_beam_indices = torch.zeros((batch_size, self.group_size), dtype=next_indices.dtype, device=device)
+
+        if eos_token_id is not None and not isinstance(eos_token_id, torch.Tensor):
+            if isinstance(eos_token_id, int):
+                eos_token_id = [eos_token_id]
+            eos_token_id = torch.tensor(eos_token_id)
+
+        for batch_idx, beam_hyp in enumerate(self._beam_hyps):
+            if self._done[batch_idx]:
+                if self.num_beams < len(beam_hyp):
+                    raise ValueError(f"Batch can only be done if at least {self.num_beams} beams have been generated")
+                if eos_token_id is None or pad_token_id is None:
+                    raise ValueError("Generated beams >= num_beams -> eos_token_id and pad_token have to be defined")
+                # pad the batch
+                next_beam_scores[batch_idx, :] = 0
+                next_beam_tokens[batch_idx, :] = pad_token_id
+                next_beam_indices[batch_idx, :] = 0
+                continue
+
+            # next tokens for this sentence.
+            beam_idx = 0
+            for beam_token_rank, (next_token, next_score, next_index) in enumerate(
+                zip(next_tokens[batch_idx], next_scores[batch_idx], next_indices[batch_idx])
+            ):
+                batch_beam_idx = batch_idx * self.group_size + next_index
+                # add to generated hypotheses if end of sentence
+                if (eos_token_id is not None) and (next_token.item() in eos_token_id):
+                    # if beam_token does not belong to top num_beams tokens, it should not be added
+                    is_beam_token_worse_than_top_num_beams = beam_token_rank >= self.group_size
+                    if is_beam_token_worse_than_top_num_beams:
+                        continue
+
+                    completes_constraint = self.check_completes_constraints(input_ids[batch_beam_idx].cpu().tolist())
+                    if completes_constraint:
+                        if beam_indices is not None:
+                            beam_index = beam_indices[batch_beam_idx]
+                            beam_index = beam_index + (batch_beam_idx,)
+                        else:
+                            beam_index = None
+
+                        beam_hyp.add(
+                            input_ids[batch_beam_idx].clone(),
+                            next_score.item(),
+                            beam_indices=beam_index,
+                            generated_len=cur_len - decoder_prompt_len,
+                        )
+                else:
+                    # add next predicted token since it is not eos_token
+                    next_beam_scores[batch_idx, beam_idx] = next_score
+                    next_beam_tokens[batch_idx, beam_idx] = next_token
+                    next_beam_indices[batch_idx, beam_idx] = batch_beam_idx
+                    beam_idx += 1
+
+                # once the beam for next step is full, don't add more tokens to it.
+                if beam_idx == self.group_size:
+                    break
+
+            new_scores, new_tokens, new_indices = self.step_sentence_constraint(
+                batch_idx,
+                input_ids,
+                scores_for_all_vocab,
+                next_beam_scores[batch_idx],
+                next_beam_tokens[batch_idx],
+                next_beam_indices[batch_idx],
+            )
+
+            next_beam_scores[batch_idx] = new_scores
+            next_beam_tokens[batch_idx] = new_tokens
+            next_beam_indices[batch_idx] = new_indices
+
+            if beam_idx < self.group_size:
+                raise ValueError(
+                    f"At most {self.group_size} tokens in {next_tokens[batch_idx]} can be equal to `eos_token_id:"
+                    f" {eos_token_id}`. Make sure {next_tokens[batch_idx]} are corrected."
+                )
+
+            # Check if we are done so that we can save a pad step if all(done)
+            self._done[batch_idx] = self._done[batch_idx] or beam_hyp.is_done(
+                next_scores[batch_idx].max().item(), cur_len, decoder_prompt_len
+            )
+
+        return UserDict(
+            {
+                "next_beam_scores": next_beam_scores.view(-1),
+                "next_beam_tokens": next_beam_tokens.view(-1),
+                "next_beam_indices": next_beam_indices.view(-1),
+            }
+        )
+
+    def step_sentence_constraint(
+        self,
+        batch_idx: int,
+        input_ids: torch.LongTensor,
+        vocab_scores: torch.FloatTensor,
+        sent_beam_scores: torch.FloatTensor,
+        sent_beam_tokens: torch.LongTensor,
+        sent_beam_indices: torch.LongTensor,
+        push_progress: bool = False,
+    ):
+        # sent_beam_tokens are the next {num_beams} number of tokens that are under consideration for this beam
+        # (candidate next tokens)
+
+        # 1. Adding "advance_tokens"
+        #     using ConstraintStateList.advance(), we propose new tokens to be added into this "candidate list" that will
+        #     advance us in fulfilling the constraints.
+
+        # 2. Selecting best candidates such that we end up with highest probable candidates
+        #     that fulfill our constraints.
+
+        orig_len = sent_beam_indices.size(0)
+        device = sent_beam_indices.device
+
+        # initialize states
+        topk_contraint_states = self.make_constraint_states(orig_len)
+        advance_constraint_states = self.make_constraint_states(orig_len)
+
+        sidx, eidx = batch_idx * orig_len, (batch_idx + 1) * orig_len
+        this_batch_input_ids = input_ids[sidx:eidx]
+        this_batch_token_scores = vocab_scores[sidx:eidx]
+        full_hypotheses = torch.cat((input_ids[sent_beam_indices], sent_beam_tokens.unsqueeze(-1)), dim=-1)
+
+        # need to make new hypothesis that advance the constraints
+        track_new = {
+            "new_seqs": full_hypotheses.tolist(),
+            "new_states": [],
+            "new_indices": [],
+            "new_tokens": [],
+            "new_scores": [],
+        }
+        for seq_idx, pre_seq in enumerate(this_batch_input_ids):
+            # pre_seq = ith sequence generated before this step.
+
+            # input_ids -> (topk) generic beam search best model next tokens
+            #           -> (advance) constraints forcing the next token
+            # either way, we need to sort them into "banks" later, so store a "ConstraintListState" for all types of
+            # hypotheses.
+
+            topk_state = topk_contraint_states[seq_idx]
+            topk_state.reset(full_hypotheses[seq_idx].cpu().tolist())
+
+            advance_state = advance_constraint_states[seq_idx]
+            advance_state.reset(pre_seq.cpu().tolist())
+
+            if not advance_state.completed:
+                advance_tokens = torch.LongTensor(advance_state.advance()).to(device)
+                for advance_token in advance_tokens:
+                    # since adding each `advance_token` leads to a different hypothesis, create new state instance.
+                    new_state = advance_state.copy(stateful=True)
+                    new_state.add(advance_token.cpu().tolist())
+
+                    advance_seq = torch.cat((pre_seq, advance_token.unsqueeze(0)), -1).cpu().tolist()
+                    if advance_seq not in track_new["new_seqs"]:
+                        # prevent duplicates, which are basically bound to happen in this process.
+                        track_new["new_seqs"].append(advance_seq)
+                        track_new["new_indices"].append(sidx + seq_idx)  # idx -> global idx across all the batches
+                        track_new["new_tokens"].append(advance_token)
+                        track_new["new_scores"].append(this_batch_token_scores[seq_idx].take(advance_token))
+                        track_new["new_states"].append(new_state)
+            elif push_progress:
+                # Basically, `sent_beam_indices` often chooses very little among `input_ids` the generated sequences that
+                # actually fulfill our constraints. For example, let constraints == ["loves pies"] and
+
+                #     pre_seq_1 = "The child loves pies and" pre_seq_2 = "The child plays in the playground and"
+
+                # Without this step, if `sent_beam_indices` is something like [1,1], then
+                #     1. `pre_seq_1` won't be added to the list of (topk) hypothesis since it's not in the indices and
+                #     2.  it won't be added to the list of (advance) hypothesis since it's completed already. (this is
+                #         the else part of `if constraints_completed[seq_idx]`)
+                #     3. it ends up simply getting removed from consideration.
+
+                # #3 might be fine and actually desired, since it's likely that it's a low-probability output anyways,
+                # especially if it's not in the list of `sent_beam_indices`. But this often leads to lengthened beam
+                # search times, since completed sequences keep getting removed after all this effort for constrained
+                # generation.
+
+                # Here, we basically take `pre_seq_1` and to "push" it into the considered list of hypotheses, by simply
+                # appending the next likely token in the vocabulary and adding it to the list of hypotheses.
+
+                new_score, new_token = torch.max(this_batch_token_scores[seq_idx], 0)  # some next probable token
+                advance_seq = torch.cat((pre_seq, new_token.unsqueeze(0)), -1)
+
+                advance_state = advance_constraint_states[seq_idx]
+
+                advance_seq = advance_seq.cpu().tolist()
+
+                advance_state.reset(advance_seq)
+                if advance_seq not in track_new["new_seqs"]:
+                    # but still don't want to have duplicates
+                    track_new["new_seqs"].append(advance_seq)
+                    track_new["new_indices"].append(seq_idx)
+                    track_new["new_tokens"].append(new_token)
+                    track_new["new_scores"].append(new_score)
+                    track_new["new_states"].append(advance_state)
+
+        if len(track_new["new_indices"]) > 0:
+            new_indices = torch.tensor(track_new["new_indices"]).to(device)
+            new_tokens = torch.stack(track_new["new_tokens"]).to(device)
+            new_scores = torch.stack(track_new["new_scores"]).to(device)
+
+            all_states = topk_contraint_states + track_new["new_states"]
+            all_tokens = torch.cat((sent_beam_tokens, new_tokens), -1)
+            all_scores = torch.cat((sent_beam_scores, new_scores), -1)
+            all_banks = torch.tensor([one.get_bank() for one in all_states]).to(device)
+
+            zipped = all_banks * 100 + all_scores
+            indices = zipped.sort(descending=True).indices
+            sorted_banks = all_banks[indices]
+
+            # Then we end up with {sorted among bank C}, {sorted among bank C-1}, ..., {sorted among bank 0}
+
+            counter = -1
+            cur_bank = sorted_banks[0]
+            increments = []
+            for bank in sorted_banks:
+                if bank == cur_bank:
+                    counter += 1
+                else:
+                    counter = 0
+                    cur_bank = bank
+                increments.append(counter)
+            rearrangers = torch.tensor(np.argsort(increments, kind="mergesort"))
+
+            indices = indices[rearrangers][:orig_len]
+
+            sent_beam_scores = all_scores[indices]
+            sent_beam_tokens = all_tokens[indices]
+            sent_beam_indices = torch.cat((sent_beam_indices, new_indices))[indices]
+
+        return sent_beam_scores, sent_beam_tokens, sent_beam_indices
+
+    def finalize(
+        self,
+        input_ids: torch.LongTensor,
+        final_beam_scores: torch.FloatTensor,
+        final_beam_tokens: torch.LongTensor,
+        final_beam_indices: torch.LongTensor,
+        max_length: int,
+        pad_token_id: Optional[Union[int, torch.Tensor]] = None,
+        eos_token_id: Optional[Union[int, List[int], torch.Tensor]] = None,
+        beam_indices: Optional[torch.LongTensor] = None,
+        decoder_prompt_len: Optional[int] = 0,
+    ) -> Tuple[torch.LongTensor]:
+        batch_size = len(self._beam_hyps)
+
+        if eos_token_id is not None and not isinstance(eos_token_id, torch.Tensor):
+            if isinstance(eos_token_id, int):
+                eos_token_id = [eos_token_id]
+            eos_token_id = torch.tensor(eos_token_id)
+
+        # finalize all open beam hypotheses and add to generated hypotheses
+        for batch_idx, beam_hyp in enumerate(self._beam_hyps):
+            if self._done[batch_idx]:
+                continue
+
+            # all open beam hypotheses are added to the beam hypothesis
+            # beam hypothesis class automatically keeps the best beams
+
+            ids_collect = []
+            for beam_id in range(self.num_beams):
+                batch_beam_idx = batch_idx * self.num_beams + beam_id
+                final_score = final_beam_scores[batch_beam_idx].item()
+                final_tokens = input_ids[batch_beam_idx]
+
+                completes_constraint = self.check_completes_constraints(final_tokens.cpu().tolist())
+                if completes_constraint:
+                    beam_index = beam_indices[batch_beam_idx] if beam_indices is not None else None
+                    generated_len = final_tokens.shape[-1] - decoder_prompt_len
+                    beam_hyp.add(final_tokens, final_score, beam_indices=beam_index, generated_len=generated_len)
+                    ids_collect.append(beam_id)
+
+            # due to overly complex constraints or other factors, sometimes we can't gaurantee a successful
+            # generation. In these cases we simply return the highest scoring outputs.
+            if len(ids_collect) < self.num_beam_hyps_to_keep:
+                for beam_id in range(self.num_beams):
+                    if beam_id not in ids_collect:
+                        batch_beam_idx = batch_idx * self.num_beams + beam_id
+                        final_score = final_beam_scores[batch_beam_idx].item()
+                        final_tokens = input_ids[batch_beam_idx]
+                        generated_len = final_tokens.shape[-1] - decoder_prompt_len
+                        beam_hyp.add(final_tokens, final_score, generated_len=generated_len)
+                    if len(ids_collect) >= self.num_beam_hyps_to_keep:
+                        break
+
+        # select the best hypotheses
+        sent_lengths = input_ids.new(batch_size * self.num_beam_hyps_to_keep)
+        best = []
+        best_indices = []
+        best_scores = torch.zeros(batch_size * self.num_beam_hyps_to_keep, device=self.device, dtype=torch.float32)
+
+        # retrieve best hypotheses
+        for i, beam_hyp in enumerate(self._beam_hyps):
+            sorted_hyps = sorted(beam_hyp.beams, key=lambda x: x[0])
+            for j in range(self.num_beam_hyps_to_keep):
+                best_hyp_tuple = sorted_hyps.pop()
+                best_score = best_hyp_tuple[0]
+                best_hyp = best_hyp_tuple[1]
+                best_index = best_hyp_tuple[2]
+                sent_lengths[self.num_beam_hyps_to_keep * i + j] = len(best_hyp)
+
+                # append to lists
+                best.append(best_hyp)
+
+                # append indices to list
+                best_indices.append(best_index)
+
+                best_scores[i * self.num_beam_hyps_to_keep + j] = best_score
+
+        # prepare for adding eos
+        sent_lengths_max = sent_lengths.max().item() + 1
+
+        sent_max_len = min(sent_lengths_max, max_length) if max_length is not None else sent_lengths_max
+        decoded: torch.LongTensor = input_ids.new(batch_size * self.num_beam_hyps_to_keep, sent_max_len)
+
+        if len(best_indices) > 0 and best_indices[0] is not None:
+            indices: torch.LongTensor = input_ids.new(batch_size * self.num_beam_hyps_to_keep, sent_max_len)
+        else:
+            indices = None
+
+        # shorter batches are padded if needed
+        if sent_lengths.min().item() != sent_lengths.max().item():
+            if pad_token_id is None:
+                raise ValueError("`pad_token_id` has to be defined")
+            decoded.fill_(pad_token_id)
+
+        if indices is not None:
+            indices.fill_(-1)
+
+        # fill with hypotheses and eos_token_id if the latter fits in
+        for i, (hypo, best_idx) in enumerate(zip(best, best_indices)):
+            decoded[i, : sent_lengths[i]] = hypo
+
+            if indices is not None:
+                indices[i, : len(best_idx)] = torch.tensor(best_idx)
+
+            if sent_lengths[i] < sent_max_len:
+                # inserting only the first eos_token_id
+                decoded[i, sent_lengths[i]] = eos_token_id[0]
+
+        return UserDict(
+            {
+                "sequences": decoded,
+                "sequence_scores": best_scores,
+                "beam_indices": indices,
+            }
+        )
+
+
+class BeamHypotheses:
+    def __init__(self, num_beams: int, length_penalty: float, early_stopping: bool, max_length: Optional[int] = None):
+        """
+        Initialize n-best list of hypotheses.
+        """
+        self.length_penalty = length_penalty
+        self.early_stopping = early_stopping
+        self.max_length = max_length
+        self.num_beams = num_beams
+        self.beams = []
+        self.worst_score = 1e9
+
+        if not isinstance(self.early_stopping, bool) and self.max_length is None:
+            raise ValueError(
+                "When `do_early_stopping` is set to a string, `max_length` must be defined. Ensure it is passed to the"
+                " BeamScorer class instance at initialization time."
+            )
+
+    def __len__(self):
+        """
+        Number of hypotheses in the list.
+        """
+        return len(self.beams)
+
+    def add(
+        self,
+        hyp: torch.LongTensor,
+        sum_logprobs: float,
+        beam_indices: Optional[torch.LongTensor] = None,
+        generated_len: Optional[int] = None,
+    ):
+        """
+        Add a new hypothesis to the list.
+        """
+        if generated_len is not None:
+            score = sum_logprobs / (generated_len**self.length_penalty)
+        # This 'else' case exists for retrocompatibility
+        else:
+            score = sum_logprobs / (hyp.shape[-1] ** self.length_penalty)
+
+        if len(self) < self.num_beams or score > self.worst_score:
+            self.beams.append((score, hyp, beam_indices))
+            if len(self) > self.num_beams:
+                sorted_next_scores = sorted([(s, idx) for idx, (s, _, _) in enumerate(self.beams)])
+                del self.beams[sorted_next_scores[0][1]]
+                self.worst_score = sorted_next_scores[1][0]
+            else:
+                self.worst_score = min(score, self.worst_score)
+
+    def is_done(self, best_sum_logprobs: float, cur_len: int, decoder_prompt_len: Optional[int] = 0) -> bool:
+        """
+        If there are enough hypotheses and that none of the hypotheses being generated can become better than the worst
+        one in the heap, then we are done with this sentence.
+        """
+
+        if len(self) < self.num_beams:
+            return False
+
+        # `True`: stop as soon as at least `num_beams` hypotheses are finished
+        if self.early_stopping is True:
+            return True
+        # `False`: heuristic -- compute best possible score from `cur_len`, even though it is not entirely accurate
+        #  when `length_penalty` is positive. See the discussion below for more details.
+        # https://github.com/huggingface/transformers/pull/20901#issuecomment-1369845565
+        elif self.early_stopping is False:
+            highest_attainable_score = best_sum_logprobs / (cur_len - decoder_prompt_len) ** self.length_penalty
+            ret = self.worst_score >= highest_attainable_score
+            return ret
+        # `"never"`: compute the best possible score, depending on the signal of `length_penalty`
+        else:
+            # `length_penalty` > 0.0 -> max denominator is obtaned from `max_length`, not from `cur_len` -> min
+            # abs(`highest_attainable_score`) is obtained -> `highest_attainable_score` is negative, hence we obtain
+            # its max this way
+            if self.length_penalty > 0.0:
+                if self.max_length <= decoder_prompt_len:
+                    raise ValueError("max_length is not larger than decoder prompt length")
+                highest_attainable_score = (
+                    best_sum_logprobs / (self.max_length - decoder_prompt_len) ** self.length_penalty
+                )
+            # the opposite logic applies here (max `highest_attainable_score` from `cur_len`)
+            else:
+                highest_attainable_score = best_sum_logprobs / (cur_len - decoder_prompt_len) ** self.length_penalty
+            ret = self.worst_score >= highest_attainable_score
+            return ret
diff --git a/.venv/lib/python3.11/site-packages/transformers/generation/candidate_generator.py b/.venv/lib/python3.11/site-packages/transformers/generation/candidate_generator.py
new file mode 100644
index 0000000000000000000000000000000000000000..ba5d0f0005a6798a4a8abd83e4610d391ad0cfc7
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/transformers/generation/candidate_generator.py
@@ -0,0 +1,871 @@
+# coding=utf-8
+# Copyright 2023 The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import copy
+from typing import TYPE_CHECKING, Any, Dict, Optional, Tuple
+
+import numpy as np
+import torch
+
+from ..utils import is_sklearn_available
+
+
+if is_sklearn_available():
+    from sklearn.metrics import roc_curve
+
+from ..cache_utils import DynamicCache
+from ..pytorch_utils import isin_mps_friendly
+from .logits_process import LogitsProcessorList, MinLengthLogitsProcessor
+
+
+if TYPE_CHECKING:
+    from ..modeling_utils import PreTrainedModel
+    from ..tokenization_utils_base import PreTrainedTokenizerBase
+    from .configuration_utils import GenerationConfig
+
+
+class CandidateGenerator:
+    """Abstract base class for all candidate generators that can be applied during assisted generation."""
+
+    def get_candidates(self, input_ids: torch.LongTensor) -> Tuple[torch.LongTensor, Optional[torch.FloatTensor]]:
+        """
+        Fetches the candidates to be tried for the current input.
+
+        Args:
+            input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
+                Indices of input sequence tokens in the vocabulary. [What are input IDs?](../glossary#input-ids)
+
+        Return:
+            `torch.LongTensor` of shape `(batch_size, candidate_length)` containing the candidate sequences to be
+            assessed by the model and, optionally, a `torch.FloatTensor` of shape `(batch_size, candidate_length,
+            vocabulary_size)` containing the logits associated to each candidate.
+        """
+        raise NotImplementedError(
+            f"{self.__class__} is an abstract class. Only classes inheriting this class can call `get_candidates`."
+        )
+
+    def update_candidate_strategy(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, num_matches: int):
+        """
+        Updates the candidate generation strategy based on the outcomes.
+
+        Args:
+            input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
+                Indices of input sequence tokens in the vocabulary. [What are input IDs?](../glossary#input-ids)
+            scores (`torch.FloatTensor` of shape `(batch_size, candidate_length, config.vocab_size)`):
+                Prediction scores of a language modeling head. These can be logits for each vocabulary when not using
+                beam search or log softmax for each vocabulary token when using beam search
+            num_matches (`int`):
+                The number of matches between the candidate sequences and the model predictions.
+        """
+        raise NotImplementedError(
+            f"{self.__class__} is an abstract class. Only classes inheriting this class can call "
+            "`update_candidate_strategy`."
+        )
+
+
+class AssistedCandidateGenerator(CandidateGenerator):
+    """
+    `CandidateGenerator` class to be used for assisted generation and speculative decoding. This class generates
+    candidates through the use of a smaller model. Read the following blog post for more information:
+    https://huggingface.co/blog/assisted-generation
+
+    Args:
+        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
+            Indices of input sequence tokens in the vocabulary. [What are input IDs?](../glossary#input-ids)
+        assistant_model (`PreTrainedModel`):
+            The model to be used for generating candidates. This model should be smaller than the main model.
+        generation_config (`~generation.GenerationConfig`, *optional*):
+            The generation configuration to be used as base parametrization for the generation call.
+        logits_processor (`LogitsProcessorList`):
+            An instance of [`LogitsProcessorList`]. List of instances of class derived from [`LogitsProcessor`]
+            used to modify the prediction scores of the language modeling head applied at each generation step.
+        model_kwargs (`Dict`):
+            The keyword arguments that will be passed to the main model, and are used as base inputs for the assistant
+            model as well.
+        inputs_tensor (`torch.Tensor`, *optional*):
+            The model input tensor. In encoder-decoder models, this is the encoder input.
+    """
+
+    def __init__(
+        self,
+        input_ids: torch.LongTensor,
+        assistant_model: "PreTrainedModel",
+        generation_config: "GenerationConfig",
+        model_kwargs: Dict,
+        inputs_tensor: Optional[torch.Tensor] = None,
+        logits_processor: "LogitsProcessorList" = None,
+    ):
+        # Make sure all data at the same device as assistant model
+        device = assistant_model.device
+        input_ids = input_ids.to(device)
+        if inputs_tensor is not None:
+            inputs_tensor = inputs_tensor.to(device)
+
+        # Prepare the assistant and the starting number of candidate tokens
+        self.assistant_model = assistant_model
+        self.num_assistant_tokens = assistant_model.generation_config.num_assistant_tokens
+        self.assistant_confidence_threshold = assistant_model.generation_config.assistant_confidence_threshold
+
+        # Set eos in assistant same as in target model
+        self.assistant_model.generation_config.eos_token_id = generation_config.eos_token_id
+
+        # Prepare the kwargs for the assistant model
+        assistant_kwargs = {}
+        for key, value in model_kwargs.items():  # deepcopy crashes if we attempt to copy encoder outputs with grads
+            if key not in ("encoder_outputs", "assistant_encoder_outputs", "past_key_values"):
+                assistant_kwargs[key] = (
+                    value.detach().to(device) if isinstance(value, torch.Tensor) else copy.deepcopy(value)
+                )
+
+        # Remove potential default "num_logits_to_keep" key
+        if "num_logits_to_keep" in assistant_kwargs.keys() and not assistant_model._supports_num_logits_to_keep():
+            del assistant_kwargs["num_logits_to_keep"]
+
+        if "assistant_encoder_outputs" in model_kwargs:
+            assistant_kwargs["encoder_outputs"] = model_kwargs["assistant_encoder_outputs"]
+        elif assistant_model.config.is_encoder_decoder:
+            inputs_tensor, model_input_name, assistant_kwargs = assistant_model._prepare_model_inputs(
+                inputs_tensor, assistant_model.generation_config.bos_token_id, assistant_kwargs
+            )
+            assistant_kwargs = assistant_model._prepare_encoder_decoder_kwargs_for_generation(
+                inputs_tensor, assistant_kwargs, model_input_name, assistant_model.generation_config
+            )
+        elif "encoder_outputs" in model_kwargs:
+            assistant_kwargs["encoder_outputs"] = model_kwargs["encoder_outputs"]
+        self.assistant_kwargs = assistant_kwargs
+
+        # Prepare assistant model's keys of inputs
+        if assistant_model.config.is_encoder_decoder:
+            # both are encoder-decoder
+            self.input_ids_key = "decoder_input_ids"
+        elif "encoder_outputs" in assistant_kwargs:
+            # special case for encoder-decoder with decoder-only assistant (like DistilWhisper)
+            self.input_ids_key = "input_ids"
+            self.assistant_kwargs["attention_mask"] = self.assistant_kwargs.get(
+                "decoder_attention_mask",
+                torch.ones((input_ids.shape[0], 1), device=input_ids.device, dtype=torch.long),
+            )
+        else:
+            # both are decoder-only
+            self.input_ids_key = "input_ids"
+
+        # Prepare generation-related options.
+        self.logits_processor = logits_processor if logits_processor is not None else LogitsProcessorList()
+        self.generation_config = copy.deepcopy(generation_config)
+
+        self.generation_config.return_dict_in_generate = True
+        self.generation_config.output_scores = True
+        self.generation_config.assistant_confidence_threshold = self.assistant_confidence_threshold
+        # this flag allow us set the confidence stopping criteria for assistant model generation.
+        self.generation_config.is_assistant = True
+
+        # avoid unnecessary warnings that min_length is larger than max_new_tokens
+        # remove the `MinLengthLogitsProcessor` if exists (NOTE: no need to check for `MinNewTokensLogitsProcessor`)
+        self.main_model_min_length = self.generation_config.min_length
+        self.generation_config.min_length = 0
+        self.generation_config.min_new_tokens = None
+        for processor in self.logits_processor:
+            if isinstance(processor, MinLengthLogitsProcessor):
+                raise ValueError(
+                    "Passing `MinLengthLogitsProcessor` when using `assisted_generation is disabled. "
+                    "Please pass in `min_length` into `.generate()` instead"
+                )
+
+        # We need to roll back the cache in assisted generation, only DynamicCache is supported
+        self.generation_config.cache_implementation = None
+
+        if (
+            is_sklearn_available()
+            and self.assistant_model.generation_config.assistant_confidence_threshold
+            and type(self) is AssistedCandidateGenerator
+        ):
+            self.probs = []
+            self.matches = []
+
+    def get_candidates(self, input_ids: torch.LongTensor) -> Tuple[torch.LongTensor, Optional[torch.FloatTensor]]:
+        """
+        Fetches the candidates to be tried for the current input.
+
+        Args:
+            input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
+                Indices of input sequence tokens in the vocabulary. [What are input IDs?](../glossary#input-ids)
+
+        Return:
+            `torch.LongTensor` of shape `(batch_size, candidate_length)` containing the candidate sequences to be
+            assessed by the model and a `torch.FloatTensor` of shape `(batch_size, candidate_length,
+            vocabulary_size)` containing the logits associated to each candidate.
+        """
+        input_ids = input_ids.to(self.assistant_model.device)
+        # Calculate new tokens to generate
+        min_new_tokens, max_new_tokens = self._calculate_new_tokens(input_ids)
+        if max_new_tokens == 0:
+            return input_ids, None
+        # Update past key values and masks
+        self._update_past_and_masks(input_ids)
+        # Generate candidates
+        generation_args = self._prepare_generation_args(input_ids, min_new_tokens, max_new_tokens)
+        candidate_ids, candidate_logits = self._generate_candidates(generation_args)
+        return candidate_ids, candidate_logits
+
+    def update_candidate_strategy(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, num_matches: int):
+        """
+        Updates the candidate generation strategy based on the outcomes.
+
+        Args:
+            input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
+                Indices of input sequence tokens in the vocabulary. [What are input IDs?](../glossary#input-ids)
+            scores (`torch.FloatTensor` of shape `(batch_size, candidate_length, config.vocab_size)`):
+                Prediction scores of a language modeling head. These can be logits for each vocabulary when not using
+                beam search or log softmax for each vocabulary token when using beam search
+            num_matches (`int`):
+                The number of matches between the candidate sequences and the model predictions.
+        """
+        # Adjust the max number of assistant tokens to use in the next iteration. This is a simple heuristic,
+        # probably can be improved -- we want to balance the benefits of getting assistant tokens correct with the
+        # cost of forecasting incorrect assistant tokens.
+        if self.assistant_model.generation_config.num_assistant_tokens_schedule in {
+            "heuristic",
+            "heuristic_transient",
+        }:
+            # len(scores[0])-1 is the number of candidates according to the target tokenizer.
+            if num_matches == len(scores[0]) - 1:
+                self.num_assistant_tokens += 2.0
+            else:
+                self.num_assistant_tokens = max(1.0, self.num_assistant_tokens - 1.0)
+
+        # The assistant's confidence threshold is adjusted throughout the speculative iterations to reduce the number of unnecessary draft and target forward passes. The costs are estimated based on the ROC curve, which considers the probability of the draft token and its match with the target. A cost of 25% is assigned to false positives and 75% to false negatives.
+        # This adaptation is not compatible with UAG, as it relies on the number of matched tokens based on the draft vocabulary, which is unavailable in UAG.
+        if (
+            is_sklearn_available()
+            and self.assistant_model.generation_config.assistant_confidence_threshold
+            and type(self) is AssistedCandidateGenerator
+        ):
+            # update self.matches
+            self.matches.extend([1] * num_matches)
+            if len(self.probs) > len(self.matches):
+                self.matches.append(0)
+
+            # update self.probs
+            excess_length = len(self.probs) - len(self.matches)
+            if excess_length > 0:
+                del self.probs[-excess_length:]
+
+            if (
+                len(self.probs) > 5 and {0, 1}.issubset(self.matches)
+            ):  # require at least 5 samples to calculate the ROC curve and at least one positive and one negative sample
+                fpr, tpr, thresholds = roc_curve(self.matches, self.probs)
+                fnr = 1 - tpr
+
+                # Calculate the cost for each threshold
+                costs = fpr + 3 * fnr
+
+                # Find the threshold that minimizes the cost
+                optimal_threshold_index = np.argmin(costs)
+                best_threshold = thresholds[optimal_threshold_index]
+
+                self.assistant_model.generation_config.assistant_confidence_threshold = best_threshold
+
+    def _calculate_new_tokens(self, input_ids: torch.LongTensor) -> Tuple[int, int]:
+        """Calculate the minimum and maximum number of new tokens to generate."""
+        new_cur_len = input_ids.shape[-1]
+        max_new_tokens = min(int(self.num_assistant_tokens), self.generation_config.max_length - new_cur_len - 1)
+        min_new_tokens = max(min(max_new_tokens, self.main_model_min_length - new_cur_len), 0)
+        return min_new_tokens, max_new_tokens
+
+    def _update_past_and_masks(self, input_ids: torch.LongTensor, remove_from_pkv: int = 0) -> bool:
+        """Update past key values and attention masks for subsequent generation rounds."""
+        has_past_key_values = self.assistant_kwargs.get("past_key_values", None) is not None
+        if has_past_key_values:
+            new_cache_size = input_ids.shape[-1] - 1 - remove_from_pkv
+            self.assistant_kwargs["past_key_values"] = _crop_past_key_values(
+                self.assistant_model, self.assistant_kwargs["past_key_values"], new_cache_size - 1
+            )
+            self.assistant_kwargs = _prepare_attention_mask(
+                self.assistant_kwargs, input_ids.shape[-1], self.assistant_model.config.is_encoder_decoder
+            )
+            self.assistant_kwargs = _prepare_token_type_ids(self.assistant_kwargs, input_ids.shape[-1])
+        return has_past_key_values
+
+    def _prepare_generation_args(self, input_ids: torch.LongTensor, min_new_tokens: int, max_new_tokens: int) -> Dict:
+        """Prepare arguments for the generation call."""
+        return {
+            self.input_ids_key: input_ids,
+            "min_new_tokens": min_new_tokens,
+            "max_new_tokens": max_new_tokens,
+            "generation_config": self.generation_config,
+            "logits_processor": self.logits_processor,
+        }
+
+    def _generate_candidates(self, generation_args: Dict) -> Tuple[torch.LongTensor, Optional[torch.FloatTensor]]:
+        """Generate candidate sequences using the assistant model."""
+        assistant_output = self.assistant_model.generate(**generation_args, **self.assistant_kwargs)
+        self.assistant_kwargs["past_key_values"] = assistant_output.past_key_values
+        if (
+            is_sklearn_available()
+            and self.assistant_model.generation_config.assistant_confidence_threshold
+            and type(self) is AssistedCandidateGenerator
+        ):
+            scores_tensor = torch.cat(assistant_output.scores, dim=0)
+            scores_softmax = torch.softmax(scores_tensor, dim=-1)
+            ids = assistant_output.sequences[-1, -len(assistant_output.scores) :]
+            p = scores_softmax[range(len(ids)), ids]
+            self.probs.extend(p.tolist())
+        candidate_logits = torch.stack(assistant_output.scores, dim=1)
+        candidate_ids = assistant_output.sequences
+        return candidate_ids, candidate_logits
+
+
+class AssistedCandidateGeneratorDifferentTokenizers(AssistedCandidateGenerator):
+    """
+    `CandidateGenerator` class to be used for Universal Assisted Generation (UAD): assisted generation with different tokenizers
+    for the assistant and main models. This class generates candidates through the use of a smaller
+    model.
+
+    The main model input tokens are re-encoded into assistant model tokens, then candidate tokens are generated in the assistant encoding, which are
+    in turn re-encoded into main model candidate tokens. Validation then proceeds as explained above.
+    The re-encoding steps involve decoding token ids into text and then encoding the text using a different tokenizer.
+    Since re-encoding the tokens may result in tokenization discrepancies, UAD finds the longest common subsequence between the source and target encodings,
+    to ensure the new tokens include the correct prompt suffix.
+
+    Args:
+        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
+            Indices of input sequence tokens in the vocabulary. [What are input IDs?](../glossary#input-ids)
+        assistant_model (`PreTrainedModel`):
+            The model to be used for generating candidates. This model should be smaller than the main model.
+        target_tokenizer (`PreTrainedTokenizerBase`):
+            The tokenizer used for the target model.
+        assistant_tokenizer (`PreTrainedTokenizerBase`):
+            The tokenizer used for the assistant model.
+        generation_config (`~generation.GenerationConfig`, *optional*):
+            The generation configuration to be used as base parametrization for the generation call.
+        logits_processor (`LogitsProcessorList`):
+            An instance of [`LogitsProcessorList`]. List of instances of class derived from [`LogitsProcessor`]
+            used to modify the prediction scores of the language modeling head applied at each generation step.
+        model_kwargs (`Dict`):
+            The keyword arguments that will be passed to the main model, and are used as base inputs for the assistant
+            model as well.
+        inputs_tensor (`torch.Tensor`, *optional*):
+            The model input tensor. In encoder-decoder models, this is the encoder input.
+    """
+
+    def __init__(
+        self,
+        input_ids: torch.LongTensor,
+        assistant_model: "PreTrainedModel",
+        target_tokenizer: "PreTrainedTokenizerBase",
+        assistant_tokenizer: "PreTrainedTokenizerBase",
+        generation_config: "GenerationConfig",
+        model_kwargs: Dict,
+        inputs_tensor: Optional[torch.Tensor] = None,
+        logits_processor: "LogitsProcessorList" = None,
+    ):
+        super().__init__(input_ids, assistant_model, generation_config, model_kwargs, inputs_tensor, logits_processor)
+
+        self.target_tokenizer = target_tokenizer
+        self.assistant_tokenizer = assistant_tokenizer
+        self.prev_target_ids_len: Optional[int] = None
+        self.prev_assistant_ids = None
+        self.target_lookbehind = assistant_model.generation_config.target_lookbehind
+        self.assistant_lookbehind = assistant_model.generation_config.assistant_lookbehind
+
+    @staticmethod
+    def _get_longest_diag_dict(input_matrix, nonzero_idx):
+        """
+        Calculates the length of the longest diagonal sequence in a given matrix.
+        Args:
+            input_matrix (torch.Tensor): The input matrix.
+            nonzero_idx (torch.Tensor): The indices of the non-zero elements in the matrix.
+        Returns:
+            dict: A dictionary where the keys are the indices of the non-zero elements and the values are the lengths of the longest diagonal sequences starting from those indices.
+        """
+
+        visited = set()
+        diags = {}
+        for idx in nonzero_idx:
+            start_idx = torch.clone(idx)
+            tuple_start_idx = tuple(start_idx.tolist())
+
+            if tuple_start_idx in visited:
+                continue
+
+            visited.add(tuple_start_idx)
+            cur_diag_len = 1
+            start_idx += 1
+            while start_idx[0] < input_matrix.shape[0] and start_idx[1] < input_matrix.shape[1]:
+                tuple_start_idx = tuple(start_idx.tolist())
+                visited.add(tuple_start_idx)
+
+                if input_matrix[start_idx[0], start_idx[1]] == 1:
+                    cur_diag_len += 1
+                    start_idx += 1
+                else:
+                    break
+
+            diags[idx] = cur_diag_len
+        return diags
+
+    @staticmethod
+    def _get_longest_diag_index(input_matrix):
+        """
+        Returns the start index and length of the longest diagonal in the given input.
+        Args:
+            input_matrix (numpy.ndarray): The input matrix.
+        Returns:
+            tuple: A tuple containing the start index and length of the longest diagonal.
+        """
+
+        diags = AssistedCandidateGeneratorDifferentTokenizers._get_longest_diag_dict(
+            input_matrix, input_matrix.nonzero()
+        )
+        diags_values = list(diags.values())
+        diags_keys = list(diags.keys())
+        best_diag = np.argmax(diags_values)
+        diag_start_index = diags_keys[best_diag]
+        diag_start_length = diags_values[best_diag]
+        return diag_start_index, diag_start_length
+
+    @staticmethod
+    def _get_tokens_diag(prompt, prompt_plus_new_tokens):
+        """
+        Input:
+            prompt: 2D array of shape (batch_size, prompt_length), represents the original prompt tokens
+            prompt_plus_new_tokens: 2D array of shape (batch_size, prompt_length), represents the suffix of the original prompt, with additional new tokens.
+        Output:
+            discrepancy_length: int, represents the number of tokens that need to be replaced from prompt
+            new_tokens_only: 2D array of shape (batch_size, new_token_length), represents the new tokens that are not in prompt
+            discrepancy_only: 2D array of shape (batch_size, discrepancy_length), represents the new tokens that are in prompt but not in prompt_plus_new_tokens
+        """
+        compare_mat = prompt_plus_new_tokens.T == prompt
+        if not torch.is_tensor(compare_mat):
+            compare_mat = torch.tensor(compare_mat)
+
+        compare_mat_int = compare_mat.to(int)
+
+        if not compare_mat_int.any().item():
+            # empty intersection between prompt and prompt_plus_new_tokens
+            return None, None, None
+
+        longest_location, longest_diag_length = AssistedCandidateGeneratorDifferentTokenizers._get_longest_diag_index(
+            compare_mat_int
+        )
+        new_token_start_index = longest_location[0] + longest_diag_length
+        discrepancy_with_old = longest_location[1] + longest_diag_length
+        discrepancy_length = (prompt.shape[1] - discrepancy_with_old).item()
+        new_tokens_only = prompt_plus_new_tokens[:, new_token_start_index + discrepancy_length :]
+        discrepancy_only = prompt_plus_new_tokens[
+            :, new_token_start_index : new_token_start_index + discrepancy_length
+        ]
+        return discrepancy_length, new_tokens_only, discrepancy_only
+
+    def convert_source_tokens_to_target_tokens(
+        self,
+        input_ids,
+        source_tokenizer,
+        destination_tokenizer,
+    ):
+        """
+        Convert token IDs from one tokenizer to another.
+        Args:
+            input_ids: The input token IDs.
+            source_tokenizer: The source tokenizer.
+            destination_tokenizer: The destination tokenizer.
+        Returns:
+            The converted token IDs.
+        """
+        text = source_tokenizer.batch_decode(input_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True)
+        dest_ids = destination_tokenizer(text, add_special_tokens=True, return_tensors="pt")["input_ids"]
+        return dest_ids.to(input_ids.device)
+
+    def get_candidates(self, input_ids: torch.LongTensor) -> Tuple[torch.LongTensor, Optional[torch.FloatTensor]]:
+        """
+        Fetches the candidates to be tried for the current input.
+
+        Args:
+            input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
+                Indices of input sequence tokens in the vocabulary. [What are input IDs?](../glossary#input-ids)
+
+        Return:
+            `torch.LongTensor` of shape `(batch_size, candidate_length)` containing the candidate sequences to be
+            assessed by the model and a `torch.FloatTensor` of shape `(batch_size, candidate_length,
+            vocabulary_size)` containing the logits associated to each candidate.
+        """
+        max_new_tokens = int(self.num_assistant_tokens)
+        if max_new_tokens == 0:
+            return input_ids, None
+
+        input_ids = input_ids.to(self.assistant_model.device)
+        remove_from_pkv = 0
+
+        assistant_input_ids, remove_from_pkv = self._prepare_assistant_input_ids(input_ids)
+        self.prev_assistant_ids = assistant_input_ids
+
+        min_new_tokens = max(min(max_new_tokens, self.main_model_min_length - assistant_input_ids.shape[-1]), 0)
+
+        self._update_past_and_masks(assistant_input_ids, remove_from_pkv)
+        generation_args = self._prepare_generation_args(assistant_input_ids, min_new_tokens, max_new_tokens)
+        self.assistant_kwargs.pop("attention_mask", None)
+
+        assistant_output = self.assistant_model.generate(**generation_args, **self.assistant_kwargs)
+        new_target_ids = self._process_assistant_outputs(input_ids, assistant_output.sequences, assistant_input_ids)
+
+        # Update state
+        self.prev_target_ids_len = input_ids.shape[1]
+        self.assistant_kwargs["past_key_values"] = assistant_output.past_key_values
+        self.prev_assistant_ids = assistant_output.sequences
+
+        if self.prev_target_ids_len >= new_target_ids.shape[1]:
+            return input_ids, None
+
+        return new_target_ids, None
+
+    def _prepare_assistant_input_ids(self, input_ids: torch.LongTensor) -> Tuple[torch.LongTensor, int]:
+        """Converts target input IDs to assistant input IDs, handling discrepancies."""
+        convert_kwargs = {
+            "source_tokenizer": self.target_tokenizer,
+            "destination_tokenizer": self.assistant_tokenizer,
+        }
+        remove_from_pkv = 0
+
+        if self.prev_assistant_ids is not None and self.prev_target_ids_len > self.target_lookbehind:
+            # input_ids contains all target prompt input ids and some new target input ids
+            start_index_in_target_window = self.prev_target_ids_len - self.target_lookbehind
+
+            new_assistant_ids = self.convert_source_tokens_to_target_tokens(
+                input_ids[:, start_index_in_target_window:], **convert_kwargs
+            )
+            prompt_use_length = new_assistant_ids.shape[1]
+            prompt_use = self.prev_assistant_ids[:, -prompt_use_length:]
+
+            discrepancy_length, new_tokens_only, discrepancy_only = self._get_tokens_diag(
+                prompt_use, new_assistant_ids
+            )
+            assistant_input_ids = self.prev_assistant_ids
+
+            if new_tokens_only is not None:
+                if discrepancy_length > 0 and discrepancy_only.shape[1] > 0:
+                    if discrepancy_length == discrepancy_only.shape[1]:
+                        assistant_input_ids[:, -discrepancy_length:] = discrepancy_only
+
+                    elif discrepancy_length > discrepancy_only.shape[1]:
+                        discrepancy_length_diff = discrepancy_length - discrepancy_only.shape[1]
+                        assistant_input_ids = assistant_input_ids[:, :-discrepancy_length_diff]
+                        assistant_input_ids[:, -discrepancy_only.shape[1] :] = discrepancy_only
+
+                    remove_from_pkv = discrepancy_length
+
+                if new_tokens_only.shape[1] > 0:
+                    assistant_input_ids = torch.cat([assistant_input_ids, new_tokens_only], dim=-1)
+            else:
+                # edge case: in case of no intersection between prompt and new_assistant_ids
+                assistant_input_ids = torch.cat([assistant_input_ids, new_assistant_ids], dim=-1)
+        else:
+            assistant_input_ids = self.convert_source_tokens_to_target_tokens(input_ids, **convert_kwargs)
+            self.prev_target_ids_len = input_ids.shape[1]
+
+        return assistant_input_ids, remove_from_pkv
+
+    def _process_assistant_outputs(
+        self, input_ids: torch.LongTensor, assistant_sequences: torch.LongTensor, assistant_input_ids: torch.LongTensor
+    ) -> torch.LongTensor:
+        """Processes assistant outputs to obtain target input IDs."""
+        num_prev_assistant = self.prev_assistant_ids.shape[1]
+        start_assistant_look_index = num_prev_assistant - self.assistant_lookbehind
+
+        new_target_ids_from_window = self.convert_source_tokens_to_target_tokens(
+            assistant_sequences[:, start_assistant_look_index:],
+            source_tokenizer=self.assistant_tokenizer,
+            destination_tokenizer=self.target_tokenizer,
+        )
+        target_prompt_use_length = new_target_ids_from_window.shape[1]
+
+        target_prompt_use = input_ids[:, -target_prompt_use_length:]
+
+        _, target_new_tokens_only, _ = self._get_tokens_diag(target_prompt_use, new_target_ids_from_window)
+
+        new_target_ids = input_ids
+
+        if target_new_tokens_only is not None:
+            if target_new_tokens_only.shape[1] > 0:
+                new_target_ids = torch.cat([new_target_ids, target_new_tokens_only], dim=-1)
+        else:
+            # edge case: in case of no intersection between prompt and new_target_ids
+            new_target_ids = torch.cat([new_target_ids, new_target_ids_from_window], dim=-1)
+
+        if hasattr(self.generation_config, "max_length"):
+            new_target_ids = new_target_ids[:, : self.generation_config.max_length]
+
+        return new_target_ids
+
+
+class PromptLookupCandidateGenerator(CandidateGenerator):
+    """
+    `CandidateGenerator` class to be used for prompt lookup generation. This class generates candidates by looking up
+    likely continuations in the provided prompt (input_ids) itself.
+    Read the following blog post for more information: https://github.com/apoorvumang/prompt-lookup-decoding
+
+    Args:
+        max_matching_ngram_size (`int`):
+            The maximum ngram size to be considered for matching in the prompt
+        num_output_tokens (`int`):
+            The number of tokens to be output as candidate tokens.
+        max_length (`int`):
+            The number of total maximum tokens that can be generated. For decoder-only models that includes the prompt length.
+            Defaults to 20, which is the max length used as default in generation config.
+    """
+
+    def __init__(
+        self,
+        eos_token_id: torch.Tensor = None,
+        num_output_tokens: int = 10,
+        max_matching_ngram_size: int = None,
+        max_length: int = 20,
+    ):
+        self.num_output_tokens = num_output_tokens
+        self.max_matching_ngram_size = max_matching_ngram_size if max_matching_ngram_size else 2
+        self.max_length = max_length
+        self.eos_token_id = eos_token_id
+
+        if self.max_matching_ngram_size <= 0 or self.num_output_tokens <= 0:
+            raise ValueError("Invalid max_matching_ngram_size or num_output_tokens")
+
+    def get_candidates(self, input_ids: torch.LongTensor) -> Tuple[torch.LongTensor, Optional[torch.FloatTensor]]:
+        """
+        Fetches the candidates to be tried for the current input.
+
+        Args:
+            input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
+                Indices of input sequence tokens in the vocabulary. [What are input IDs?](../glossary#input-ids)
+
+        Return:
+            `torch.LongTensor` of shape `(num_candidates, candidate_length)`: The candidate sequences to be tried.
+        """
+        input_length = input_ids.size(1)
+
+        # Don't generate more than `max_length - 1` candidates since the target model generates one extra token.
+        if self.max_length == input_length + 1:
+            return input_ids, None
+
+        chosen_ids = None
+        match_found = False
+        for ngram_size in range(min(self.max_matching_ngram_size, input_length - 1), 0, -1):
+            # Create sliding windows of size ngram_size
+            windows = input_ids.unfold(dimension=1, size=ngram_size, step=1)
+
+            # Convert ngram to a tensor for comparison
+            ngram_tensor = input_ids[0, -ngram_size:]
+
+            # Find where the windows match the ngram
+            matches = (windows == ngram_tensor).all(dim=2)
+
+            # Get the indices of matches
+            match_indices = matches.nonzero(as_tuple=True)[1]
+
+            # Iterate through match indices to find a valid continuation
+            for idx in match_indices:
+                start_idx = idx + ngram_size
+                end_idx = start_idx + self.num_output_tokens
+                end_idx = min(end_idx, input_length, self.max_length)
+
+                if start_idx < end_idx:
+                    chosen_ids = input_ids[0, start_idx:end_idx]
+                    match_found = True
+
+                    # remove remaining candidate ids if an "eos" token is found, otherwise the target model may
+                    # accept eos and the rest as valid, thus not stopping generation after "eos"
+                    # NOTE: below code is written based on the fact that assisted decoding supports only bs=1
+                    mask = isin_mps_friendly(chosen_ids, self.eos_token_id)
+                    match_indices_eos = torch.nonzero(mask)
+                    if match_indices_eos.numel() > 0:
+                        first_eos_index = match_indices_eos[0].item()
+                        chosen_ids = chosen_ids[:first_eos_index]
+                    break
+            if match_found:
+                break
+
+        if chosen_ids is None or len(chosen_ids) == 0:
+            # In case we didn't find a match return the input sequence unchanged, reverts back to autoregressive decoding
+            return input_ids, None
+
+        # Now need extend input_ids with chosen_ids
+        chosen_ids = chosen_ids.unsqueeze(0)
+        candidate_input_ids = torch.cat((input_ids, chosen_ids), dim=1)
+        # assisted_generation expects logits as well, but we don't have those here, so returning None
+        return candidate_input_ids, None
+
+    def update_candidate_strategy(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, num_matches: int):
+        """
+        Updates the candidate generation strategy based on the outcomes.
+
+        Args:
+            input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
+                Indices of input sequence tokens in the vocabulary. [What are input IDs?](../glossary#input-ids)
+            scores (`torch.FloatTensor` of shape `(batch_size, candidate_length, config.vocab_size)`):
+                Prediction scores of a language modeling head. These can be logits for each vocabulary when not using
+                beam search or log softmax for each vocabulary token when using beam search
+            num_matches (`int`):
+                The number of matches between the candidate sequences and the model predictions.
+        """
+        # Currently does nothing
+        return
+
+
+class EarlyExitCandidateGenerator(AssistedCandidateGenerator):
+    """
+    `CandidateGenerator` class to be used for assisted generation and speculative decoding. This class generates
+    candidates through the use of **the model itself**, exiting early. Can only be used with models that support early
+    exit, e.g., `facebook/layerskip-llama3.2-1B`.
+
+    Args:
+        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
+            Indices of input sequence tokens in the vocabulary. [What are input IDs?](../glossary#input-ids)
+        assistant_model (`PreTrainedModel`):
+            The original model. This model must support early exit (i.e. is trained to compute logits in earlier
+            layers).
+        generation_config (`~generation.GenerationConfig`, *optional*):
+            The generation configuration to be used as base parametrization for the generation call.
+        logits_processor (`LogitsProcessorList`):
+            An instance of [`LogitsProcessorList`]. List of instances of class derived from [`LogitsProcessor`]
+            used to modify the prediction scores of the language modeling head applied at each generation step.
+        model_kwargs (`Dict`):
+            The keyword arguments that will be passed to the main model, and are used as base inputs for the assistant
+            model as well.
+        inputs_tensor (`torch.Tensor`, *optional*):
+            The model input tensor. In encoder-decoder models, this is the encoder input.
+    """
+
+    def __init__(
+        self,
+        input_ids: torch.LongTensor,
+        assistant_model: "PreTrainedModel",
+        generation_config: "GenerationConfig",
+        model_kwargs: Dict,
+        inputs_tensor: Optional[torch.Tensor] = None,
+        logits_processor: "LogitsProcessorList" = None,
+    ):
+        super().__init__(
+            input_ids=input_ids,
+            assistant_model=assistant_model,
+            generation_config=generation_config,
+            model_kwargs=model_kwargs,
+            inputs_tensor=inputs_tensor,
+            logits_processor=logits_processor,
+        )
+        # We have to move early exit out of the generation config, otherwise the assistant will also call `generate`
+        # with early exit
+        self.assistant_early_exit = self.generation_config.assistant_early_exit
+        self.generation_config.assistant_early_exit = None
+
+    def get_candidates(self, input_ids: torch.LongTensor) -> Tuple[torch.LongTensor, Optional[torch.FloatTensor]]:
+        # Temporarily sets the number of hidden layers to the early exit value
+        base_model = getattr(self.assistant_model, self.assistant_model.base_model_prefix)
+        original_num_hidden_layers = base_model.config.num_hidden_layers
+        base_model.config.num_hidden_layers = self.assistant_early_exit
+        candidate_ids, candidate_logits = super().get_candidates(input_ids)
+        base_model.config.num_hidden_layers = original_num_hidden_layers
+        return candidate_ids, candidate_logits
+
+
+def _crop_past_key_values(model, past_key_values, max_length):
+    """Crops the past key values up to a certain maximum length."""
+    new_past = []
+    if model.config.is_encoder_decoder:
+        for idx in range(len(past_key_values)):
+            new_past.append(
+                (
+                    past_key_values[idx][0][:, :, :max_length, :],
+                    past_key_values[idx][1][:, :, :max_length, :],
+                    past_key_values[idx][2],
+                    past_key_values[idx][3],
+                )
+            )
+        past_key_values = tuple(new_past)
+    # gptbigcode is special and stores kv in shape (batch_size, seq_len, dim), if it's a multi_query model
+    elif "gptbigcode" in model.__class__.__name__.lower() or (
+        model.config.architectures is not None and "gptbigcode" in model.config.architectures[0].lower()
+    ):
+        if model.config.multi_query:
+            for idx in range(len(past_key_values)):
+                past_key_values[idx] = past_key_values[idx][:, :max_length, :]
+        else:
+            for idx in range(len(past_key_values)):
+                past_key_values[idx] = past_key_values[idx][:, :, :max_length, :]
+    elif isinstance(past_key_values, DynamicCache):
+        past_key_values.crop(max_length)
+    elif past_key_values is not None:
+        for idx in range(len(past_key_values)):
+            if past_key_values[idx] != ([], []):
+                new_past.append(
+                    (
+                        past_key_values[idx][0][:, :, :max_length, :],
+                        past_key_values[idx][1][:, :, :max_length, :],
+                    )
+                )
+            else:
+                new_past.append((past_key_values[idx][0], past_key_values[idx][1]))
+        past_key_values = tuple(new_past)
+    return past_key_values
+
+
+def _prepare_attention_mask(model_kwargs: Dict[str, Any], new_length: int, is_encoder_decoder: bool) -> Dict[str, Any]:
+    """Expands or crops the model's mask for decoding purposes, to the defined length"""
+
+    mask_key = "decoder_attention_mask" if is_encoder_decoder else "attention_mask"
+    if mask_key not in model_kwargs:
+        return model_kwargs
+
+    mask = model_kwargs[mask_key]
+    mask_length_diff = new_length - mask.shape[1]
+
+    if mask_length_diff < 0:
+        model_kwargs[mask_key] = mask[:, :mask_length_diff]
+    elif mask_length_diff > 0:
+        model_kwargs[mask_key] = torch.cat([mask, mask.new_ones((mask.shape[0], mask_length_diff))], dim=-1)
+
+    # Handle cross attention models
+    if "cross_attention_mask" in model_kwargs:
+        # Mllama case
+        cross_mask = model_kwargs["cross_attention_mask"]
+        if mask_length_diff < 0:
+            model_kwargs["cross_attention_mask"] = cross_mask[:, :mask_length_diff]
+        elif mask_length_diff > 0:
+            new_mask = cross_mask[:, -1:, :, :].repeat(1, mask_length_diff, 1, 1)
+            model_kwargs["cross_attention_mask"] = torch.cat([cross_mask, new_mask], dim=1)
+    elif "image_attention_mask" in model_kwargs:
+        # IDEFICS case
+        cross_mask = model_kwargs["image_attention_mask"]
+        if mask_length_diff < 0:
+            model_kwargs["image_attention_mask"] = cross_mask[:, :mask_length_diff]
+        elif mask_length_diff > 0:
+            new_mask = cross_mask[:, -1:, :].repeat(1, mask_length_diff, 1)
+            model_kwargs["image_attention_mask"] = torch.cat([cross_mask, new_mask], dim=1)
+
+    return model_kwargs
+
+
+def _prepare_token_type_ids(model_kwargs: Dict[str, Any], new_length: int) -> Dict[str, Any]:
+    """Expands or crops the model's token_type_ids for decoding purposes, to the defined length"""
+    if "token_type_ids" not in model_kwargs or model_kwargs["token_type_ids"] is None:
+        return model_kwargs
+
+    token_type_ids = model_kwargs["token_type_ids"]
+    final_token_type = token_type_ids[:, -1].unsqueeze(-1)
+    type_length_diff = new_length - token_type_ids.shape[1]
+
+    if type_length_diff < 0:
+        token_type_ids = token_type_ids[:, :type_length_diff]
+    elif type_length_diff > 0:
+        token_type_copies = final_token_type.repeat(1, type_length_diff)
+        model_kwargs["token_type_ids"] = torch.cat([model_kwargs["token_type_ids"], token_type_copies], dim=-1)
+    return model_kwargs
diff --git a/.venv/lib/python3.11/site-packages/transformers/generation/configuration_utils.py b/.venv/lib/python3.11/site-packages/transformers/generation/configuration_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..18cf26b8b73415411e20e75c2e953258677e5159
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/transformers/generation/configuration_utils.py
@@ -0,0 +1,1628 @@
+# coding=utf-8
+# Copyright 2022 The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Generation configuration class and utilities."""
+
+import copy
+import json
+import os
+import warnings
+from abc import ABC, abstractmethod
+from dataclasses import dataclass, is_dataclass
+from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Union
+
+from .. import __version__
+from ..configuration_utils import PretrainedConfig
+from ..utils import (
+    GENERATION_CONFIG_NAME,
+    ExplicitEnum,
+    PushToHubMixin,
+    cached_file,
+    download_url,
+    extract_commit_hash,
+    is_remote_url,
+    is_torch_available,
+    logging,
+)
+
+
+if TYPE_CHECKING:
+    from ..modeling_utils import PreTrainedModel
+
+
+logger = logging.get_logger(__name__)
+METADATA_FIELDS = ("_from_model_config", "_commit_hash", "_original_object_hash", "transformers_version")
+NEEDS_CACHE_CONFIG = {}
+NEED_SETUP_CACHE_CLASSES_MAPPING = {}
+QUANT_BACKEND_CLASSES_MAPPING = {}
+ALL_CACHE_IMPLEMENTATIONS = []
+
+if is_torch_available():
+    from ..cache_utils import (
+        HQQQuantizedCache,
+        HybridCache,
+        MambaCache,
+        OffloadedStaticCache,
+        QuantizedCacheConfig,
+        QuantoQuantizedCache,
+        SlidingWindowCache,
+        StaticCache,
+        StaticCacheConfig,
+    )
+    from .logits_process import SynthIDTextWatermarkLogitsProcessor, WatermarkLogitsProcessor
+
+    NEEDS_CACHE_CONFIG["quantized"] = QuantizedCacheConfig
+    NEEDS_CACHE_CONFIG["static"] = StaticCacheConfig
+    NEED_SETUP_CACHE_CLASSES_MAPPING = {
+        "static": StaticCache,
+        "offloaded_static": OffloadedStaticCache,
+        "sliding_window": SlidingWindowCache,
+        "hybrid": HybridCache,
+        "mamba": MambaCache,
+    }
+    QUANT_BACKEND_CLASSES_MAPPING = {"quanto": QuantoQuantizedCache, "HQQ": HQQQuantizedCache}
+    ALL_CACHE_IMPLEMENTATIONS = (
+        list(NEED_SETUP_CACHE_CLASSES_MAPPING.keys()) + list(NEEDS_CACHE_CONFIG.keys()) + ["offloaded"]
+    )
+
+
+class GenerationMode(ExplicitEnum):
+    """
+    Possible generation modes, downstream of the [`~generation.GenerationMixin.generate`] method.
+    """
+
+    # Non-beam methods
+    CONTRASTIVE_SEARCH = "contrastive_search"
+    GREEDY_SEARCH = "greedy_search"
+    SAMPLE = "sample"
+    ASSISTED_GENERATION = "assisted_generation"
+    DOLA_GENERATION = "dola_generation"
+    # Beam methods
+    BEAM_SEARCH = "beam_search"
+    BEAM_SAMPLE = "beam_sample"
+    CONSTRAINED_BEAM_SEARCH = "constrained_beam_search"
+    GROUP_BEAM_SEARCH = "group_beam_search"
+
+
+class GenerationConfig(PushToHubMixin):
+    # no-format
+    """
+    Class that holds a configuration for a generation task. A `generate` call supports the following generation methods
+    for text-decoder, text-to-text, speech-to-text, and vision-to-text models:
+
+        - *greedy decoding* if `num_beams=1` and `do_sample=False`
+        - *contrastive search* if `penalty_alpha>0.` and `top_k>1`
+        - *multinomial sampling* if `num_beams=1` and `do_sample=True`
+        - *beam-search decoding* if `num_beams>1` and `do_sample=False`
+        - *beam-search multinomial sampling* if `num_beams>1` and `do_sample=True`
+        - *diverse beam-search decoding* if `num_beams>1` and `num_beam_groups>1`
+        - *constrained beam-search decoding* if `constraints!=None` or `force_words_ids!=None`
+        - *assisted decoding* if `assistant_model` or `prompt_lookup_num_tokens` is passed to `.generate()`
+        - *dola decoding* if `dola_layers` is passed to `.generate()`
+
+    To learn more about decoding strategies refer to the [text generation strategies guide](../generation_strategies).
+
+    <Tip>
+
+    A large number of these flags control the logits or the stopping criteria of the generation. Make sure you check
+    the [generate-related classes](https://huggingface.co/docs/transformers/internal/generation_utils) for a full
+    description of the possible manipulations, as well as examples of their usage.
+
+    </Tip>
+
+    Arg:
+        > Parameters that control the length of the output
+
+        max_length (`int`, *optional*, defaults to 20):
+            The maximum length the generated tokens can have. Corresponds to the length of the input prompt +
+            `max_new_tokens`. Its effect is overridden by `max_new_tokens`, if also set.
+        max_new_tokens (`int`, *optional*):
+            The maximum numbers of tokens to generate, ignoring the number of tokens in the prompt.
+        min_length (`int`, *optional*, defaults to 0):
+            The minimum length of the sequence to be generated. Corresponds to the length of the input prompt +
+            `min_new_tokens`. Its effect is overridden by `min_new_tokens`, if also set.
+        min_new_tokens (`int`, *optional*):
+            The minimum numbers of tokens to generate, ignoring the number of tokens in the prompt.
+        early_stopping (`bool` or `str`, *optional*, defaults to `False`):
+            Controls the stopping condition for beam-based methods, like beam-search. It accepts the following values:
+            `True`, where the generation stops as soon as there are `num_beams` complete candidates; `False`, where an
+            heuristic is applied and the generation stops when is it very unlikely to find better candidates;
+            `"never"`, where the beam search procedure only stops when there cannot be better candidates (canonical
+            beam search algorithm).
+        max_time (`float`, *optional*):
+            The maximum amount of time you allow the computation to run for in seconds. generation will still finish
+            the current pass after allocated time has been passed.
+        stop_strings (`str or List[str]`, *optional*):
+            A string or a list of strings that should terminate generation if the model outputs them.
+
+        > Parameters that control the generation strategy used
+
+        do_sample (`bool`, *optional*, defaults to `False`):
+            Whether or not to use sampling ; use greedy decoding otherwise.
+        num_beams (`int`, *optional*, defaults to 1):
+            Number of beams for beam search. 1 means no beam search.
+        num_beam_groups (`int`, *optional*, defaults to 1):
+            Number of groups to divide `num_beams` into in order to ensure diversity among different groups of beams.
+            [this paper](https://arxiv.org/pdf/1610.02424.pdf) for more details.
+        penalty_alpha (`float`, *optional*):
+            The values balance the model confidence and the degeneration penalty in contrastive search decoding.
+        dola_layers (`str` or `List[int]`, *optional*):
+            The layers to use for DoLa decoding. If `None`, DoLa decoding is not used. If a string, it must
+            be one of "low" or "high", which means using the lower part or higher part of the model layers, respectively.
+            "low" means the first half of the layers up to the first 20 layers, and "high" means the last half of the
+            layers up to the last 20 layers.
+            If a list of integers, it must contain the indices of the layers to use for candidate premature layers in DoLa.
+            The 0-th layer is the word embedding layer of the model. Set to `'low'` to improve long-answer reasoning tasks,
+            `'high'` to improve short-answer tasks. Check the [documentation](https://github.com/huggingface/transformers/blob/main/docs/source/en/generation_strategies.md)
+            or [the paper](https://arxiv.org/abs/2309.03883) for more details.
+
+        > Parameters that control the cache
+
+        use_cache (`bool`, *optional*, defaults to `True`):
+            Whether or not the model should use the past last key/values attentions (if applicable to the model) to
+            speed up decoding.
+        cache_implementation (`str`, *optional*, default to `None`):
+            Name of the cache class that will be instantiated in `generate`, for faster decoding. Possible values are:
+
+            - `"static"`: [`StaticCache`]
+            - `"offloaded_static"`: [`OffloadedStaticCache`]
+            - `"sliding_window"`: [`SlidingWindowCache`]
+            - `"hybrid"`: [`HybridCache`]
+            - `"mamba"`: [`MambaCache`]
+            - `"quantized"`: [`QuantizedCache`]
+
+            We support other cache types, but they must be manually instantiated and
+            passed to `generate` through the `past_key_values` argument. See our
+            [cache documentation](https://huggingface.co/docs/transformers/en/kv_cache) for further information.
+        cache_config (`CacheConfig` or `dict`, *optional*, default to `None`):
+            Arguments used in the key-value cache class can be passed in `cache_config`. Can be passed as a `Dict` and
+            it will be converted to its repsective `CacheConfig` internally.
+            Otherwise can be passed as a `CacheConfig` class matching the indicated `cache_implementation`.
+        return_legacy_cache (`bool`, *optional*, default to `True`):
+            Whether to return the legacy or new format of the cache when `DynamicCache` is used by default.
+
+        > Parameters for manipulation of the model output logits
+
+        temperature (`float`, *optional*, defaults to 1.0):
+            The value used to module the next token probabilities. This value is set in a model's `generation_config.json` file. If it isn't set, the default value is 1.0
+        top_k (`int`, *optional*, defaults to 50):
+            The number of highest probability vocabulary tokens to keep for top-k-filtering. This value is set in a model's `generation_config.json` file. If it isn't set, the default value is 50.
+        top_p (`float`, *optional*, defaults to 1.0):
+            If set to float < 1, only the smallest set of most probable tokens with probabilities that add up to
+            `top_p` or higher are kept for generation. This value is set in a model's `generation_config.json` file. If it isn't set, the default value is 1.0
+        min_p (`float`, *optional*):
+            Minimum token probability, which will be scaled by the probability of the most likely token. It must be a
+            value between 0 and 1. Typical values are in the 0.01-0.2 range, comparably selective as setting `top_p` in
+            the 0.99-0.8 range (use the opposite of normal `top_p` values).
+        typical_p (`float`, *optional*, defaults to 1.0):
+            Local typicality measures how similar the conditional probability of predicting a target token next is to
+            the expected conditional probability of predicting a random token next, given the partial text already
+            generated. If set to float < 1, the smallest set of the most locally typical tokens with probabilities that
+            add up to `typical_p` or higher are kept for generation. See [this
+            paper](https://arxiv.org/pdf/2202.00666.pdf) for more details.
+        epsilon_cutoff (`float`, *optional*, defaults to 0.0):
+            If set to float strictly between 0 and 1, only tokens with a conditional probability greater than
+            `epsilon_cutoff` will be sampled. In the paper, suggested values range from 3e-4 to 9e-4, depending on the
+            size of the model. See [Truncation Sampling as Language Model
+            Desmoothing](https://arxiv.org/abs/2210.15191) for more details.
+        eta_cutoff (`float`, *optional*, defaults to 0.0):
+            Eta sampling is a hybrid of locally typical sampling and epsilon sampling. If set to float strictly between
+            0 and 1, a token is only considered if it is greater than either `eta_cutoff` or `sqrt(eta_cutoff) *
+            exp(-entropy(softmax(next_token_logits)))`. The latter term is intuitively the expected next token
+            probability, scaled by `sqrt(eta_cutoff)`. In the paper, suggested values range from 3e-4 to 2e-3,
+            depending on the size of the model. See [Truncation Sampling as Language Model
+            Desmoothing](https://arxiv.org/abs/2210.15191) for more details.
+        diversity_penalty (`float`, *optional*, defaults to 0.0):
+            This value is subtracted from a beam's score if it generates a token same as any beam from other group at a
+            particular time. Note that `diversity_penalty` is only effective if `group beam search` is enabled.
+        repetition_penalty (`float`, *optional*, defaults to 1.0):
+            The parameter for repetition penalty. 1.0 means no penalty. See [this
+            paper](https://arxiv.org/pdf/1909.05858.pdf) for more details.
+        encoder_repetition_penalty (`float`, *optional*, defaults to 1.0):
+            The paramater for encoder_repetition_penalty. An exponential penalty on sequences that are not in the
+            original input. 1.0 means no penalty.
+        length_penalty (`float`, *optional*, defaults to 1.0):
+            Exponential penalty to the length that is used with beam-based generation. It is applied as an exponent to
+            the sequence length, which in turn is used to divide the score of the sequence. Since the score is the log
+            likelihood of the sequence (i.e. negative), `length_penalty` > 0.0 promotes longer sequences, while
+            `length_penalty` < 0.0 encourages shorter sequences.
+        no_repeat_ngram_size (`int`, *optional*, defaults to 0):
+            If set to int > 0, all ngrams of that size can only occur once.
+        bad_words_ids (`List[List[int]]`, *optional*):
+            List of list of token ids that are not allowed to be generated. Check
+            [`~generation.NoBadWordsLogitsProcessor`] for further documentation and examples.
+        force_words_ids (`List[List[int]]` or `List[List[List[int]]]`, *optional*):
+            List of token ids that must be generated. If given a `List[List[int]]`, this is treated as a simple list of
+            words that must be included, the opposite to `bad_words_ids`. If given `List[List[List[int]]]`, this
+            triggers a [disjunctive constraint](https://github.com/huggingface/transformers/issues/14081), where one
+            can allow different forms of each word.
+        renormalize_logits (`bool`, *optional*, defaults to `False`):
+            Whether to renormalize the logits after applying all the logits processors (including the custom
+            ones). It's highly recommended to set this flag to `True` as the search algorithms suppose the score logits
+            are normalized but some logit processors break the normalization.
+        constraints (`List[Constraint]`, *optional*):
+            Custom constraints that can be added to the generation to ensure that the output will contain the use of
+            certain tokens as defined by `Constraint` objects, in the most sensible way possible.
+        forced_bos_token_id (`int`, *optional*, defaults to `model.config.forced_bos_token_id`):
+            The id of the token to force as the first generated token after the `decoder_start_token_id`. Useful for
+            multilingual models like [mBART](../model_doc/mbart) where the first generated token needs to be the target
+            language token.
+        forced_eos_token_id (`int` or List[int]`, *optional*, defaults to `model.config.forced_eos_token_id`):
+            The id of the token to force as the last generated token when `max_length` is reached. Optionally, use a
+            list to set multiple *end-of-sequence* tokens.
+        remove_invalid_values (`bool`, *optional*, defaults to `model.config.remove_invalid_values`):
+            Whether to remove possible *nan* and *inf* outputs of the model to prevent the generation method to crash.
+            Note that using `remove_invalid_values` can slow down generation.
+        exponential_decay_length_penalty (`tuple(int, float)`, *optional*):
+            This Tuple adds an exponentially increasing length penalty, after a certain amount of tokens have been
+            generated. The tuple shall consist of: `(start_index, decay_factor)` where `start_index` indicates where
+            penalty starts and `decay_factor` represents the factor of exponential decay
+        suppress_tokens (`List[int]`, *optional*):
+            A list of tokens that will be suppressed at generation. The `SupressTokens` logit processor will set their
+            log probs to `-inf` so that they are not sampled.
+        begin_suppress_tokens  (`List[int]`, *optional*):
+            A list of tokens that will be suppressed at the beginning of the generation. The `SupressBeginTokens` logit
+            processor will set their log probs to `-inf` so that they are not sampled.
+        forced_decoder_ids (`List[List[int]]`, *optional*):
+            A list of pairs of integers which indicates a mapping from generation indices to token indices that will be
+            forced before sampling. For example, `[[1, 123]]` means the second generated token will always be a token
+            of index 123.
+        sequence_bias (`Dict[Tuple[int], float]`, *optional*)):
+            Dictionary that maps a sequence of tokens to its bias term. Positive biases increase the odds of the
+            sequence being selected, while negative biases do the opposite. Check
+            [`~generation.SequenceBiasLogitsProcessor`] for further documentation and examples.
+        token_healing (`bool`, *optional*, defaults to `False`):
+            Heal tail tokens of prompts by replacing them with their appropriate extensions.
+            This enhances the quality of completions for prompts affected by greedy tokenization bias.
+        guidance_scale (`float`, *optional*):
+            The guidance scale for classifier free guidance (CFG). CFG is enabled by setting `guidance_scale > 1`.
+            Higher guidance scale encourages the model to generate samples that are more closely linked to the input
+            prompt, usually at the expense of poorer quality.
+        low_memory (`bool`, *optional*):
+            Switch to sequential beam search and sequential topk for contrastive search to reduce peak memory.
+            Used with beam search and contrastive search.
+        watermarking_config (`BaseWatermarkingConfig` or `dict`, *optional*):
+            Arguments used to watermark the model outputs by adding a small bias to randomly selected set of "green"
+            tokens. See the docs of [`SynthIDTextWatermarkingConfig`] and [`WatermarkingConfig`] for more
+            details. If passed as `Dict`, it will be converted to a `WatermarkingConfig` internally.
+
+        > Parameters that define the output variables of generate
+
+        num_return_sequences (`int`, *optional*, defaults to 1):
+            The number of independently computed returned sequences for each element in the batch.
+        output_attentions (`bool`, *optional*, defaults to `False`):
+            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
+            tensors for more details.
+        output_hidden_states (`bool`, *optional*, defaults to `False`):
+            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
+            more details.
+        output_scores (`bool`, *optional*, defaults to `False`):
+            Whether or not to return the prediction scores. See `scores` under returned tensors for more details.
+        output_logits (`bool`, *optional*):
+            Whether or not to return the unprocessed prediction logit scores. See `logits` under returned tensors for
+            more details.
+        return_dict_in_generate (`bool`, *optional*, defaults to `False`):
+            Whether or not to return a [`~utils.ModelOutput`], as opposed to returning exclusively the generated
+            sequence. This flag must be set to `True` to return the generation cache (when `use_cache` is `True`)
+            or optional outputs (see flags starting with `output_`)
+
+        > Special tokens that can be used at generation time
+
+        pad_token_id (`int`, *optional*):
+            The id of the *padding* token.
+        bos_token_id (`int`, *optional*):
+            The id of the *beginning-of-sequence* token.
+        eos_token_id (`Union[int, List[int]]`, *optional*):
+            The id of the *end-of-sequence* token. Optionally, use a list to set multiple *end-of-sequence* tokens.
+
+        > Generation parameters exclusive to encoder-decoder models
+
+        encoder_no_repeat_ngram_size (`int`, *optional*, defaults to 0):
+            If set to int > 0, all ngrams of that size that occur in the `encoder_input_ids` cannot occur in the
+            `decoder_input_ids`.
+        decoder_start_token_id (`int` or `List[int]`, *optional*):
+            If an encoder-decoder model starts decoding with a different token than *bos*, the id of that token or a list of length
+            `batch_size`. Indicating a list enables different start ids for each element in the batch
+            (e.g. multilingual models with different target languages in one batch)
+
+        > Generation parameters exclusive to assistant generation
+        is_assistant (`bool`, *optional*, defaults to `False`):
+            Whether the model is an assistant (draft) model.
+        num_assistant_tokens (`int`, *optional*, defaults to 20):
+            Defines the number of _speculative tokens_ that shall be generated by the assistant model before being
+            checked by the target model at each iteration. Higher values for `num_assistant_tokens` make the generation
+            more _speculative_ : If the assistant model is performant larger speed-ups can be reached, if the assistant
+            model requires lots of corrections, lower speed-ups are reached.
+        num_assistant_tokens_schedule (`str`, *optional*, defaults to `"constant"`):
+            Defines the schedule at which max assistant tokens shall be changed during inference.
+            - `"heuristic"`: When all speculative tokens are correct, increase `num_assistant_tokens` by 2 else
+              reduce by 1. `num_assistant_tokens` value is persistent over multiple generation calls with the same assistant model.
+            - `"heuristic_transient"`: Same as `"heuristic"` but `num_assistant_tokens` is reset to its initial value after each generation call.
+            - `"constant"`: `num_assistant_tokens` stays unchanged during generation
+        assistant_confidence_threshold (`float`, *optional*, defaults to 0.4):
+            The confidence threshold for the assistant model. If the assistant model's confidence in its prediction for the current token is lower
+            than this threshold, the assistant model stops the current token generation iteration, even if the number of _speculative tokens_
+            (defined by `num_assistant_tokens`) is not yet reached. The assistant's confidence threshold is adjusted throughout the speculative iterations to reduce the number of unnecessary draft and target forward passes, biased towards avoiding false negatives.
+            `assistant_confidence_threshold` value is persistent over multiple generation calls with the same assistant model.
+            It is an unsupervised version of the dynamic speculation lookahead
+            from Dynamic Speculation Lookahead Accelerates Speculative Decoding of Large Language Models <https://arxiv.org/abs/2405.04304>.
+        prompt_lookup_num_tokens (`int`, *optional*):
+            The number of tokens to be output as candidate tokens.
+        max_matching_ngram_size (`int`, *optional*):
+            The maximum ngram size to be considered for matching in the prompt. Default to 2 if not provided.
+        assistant_early_exit(`int`, *optional*):
+            If set to a positive integer, early exit of the model will be used as an assistant. Can only be used with
+            models that support early exit (i.e. models where logits from intermediate layers can be interpreted by the LM head).
+        assistant_lookbehind(`int`, *optional*, defaults to 10):
+            If set to a positive integer, the re-encodeing process will additionally consider the last `assistant_lookbehind` assistant tokens
+            to correctly align tokens. Can only be used with different tokenizers in speculative decoding.
+            See this [blog](https://huggingface.co/blog/universal_assisted_generation) for more details.
+        target_lookbehind(`int`, *optional*, defaults to 10):
+            If set to a positive integer, the re-encodeing process will additionally consider the last `target_lookbehind` target tokens
+            to correctly align tokens. Can only be used with different tokenizers in speculative decoding.
+            See this [blog](https://huggingface.co/blog/universal_assisted_generation) for more details.
+
+        > Parameters related to performances and compilation
+
+        compile_config (CompileConfig, *optional*):
+            If using a static cache, this controls how `generate` will `compile` the forward pass for performance
+            gains.
+
+        > Wild card
+
+        generation_kwargs:
+            Additional generation kwargs will be forwarded to the `generate` function of the model. Kwargs that are not
+            present in `generate`'s signature will be used in the model forward pass.
+    """
+
+    extra_output_flags = ("output_attentions", "output_hidden_states", "output_scores", "output_logits")
+
+    def __init__(self, **kwargs):
+        # Parameters that control the length of the output
+        self.max_length = kwargs.pop("max_length", 20)
+        self.max_new_tokens = kwargs.pop("max_new_tokens", None)
+        self.min_length = kwargs.pop("min_length", 0)
+        self.min_new_tokens = kwargs.pop("min_new_tokens", None)
+        self.early_stopping = kwargs.pop("early_stopping", False)
+        self.max_time = kwargs.pop("max_time", None)
+        self.stop_strings = kwargs.pop("stop_strings", None)
+
+        # Parameters that control the generation strategy used
+        self.do_sample = kwargs.pop("do_sample", False)
+        self.num_beams = kwargs.pop("num_beams", 1)
+        self.num_beam_groups = kwargs.pop("num_beam_groups", 1)
+        self.penalty_alpha = kwargs.pop("penalty_alpha", None)
+        self.dola_layers = kwargs.pop("dola_layers", None)
+
+        # Parameters that control the cache
+        self.use_cache = kwargs.pop("use_cache", True)
+        self.cache_implementation = kwargs.pop("cache_implementation", None)
+        self.cache_config = kwargs.pop("cache_config", None)
+        if self.cache_implementation is not None and self.cache_implementation in NEEDS_CACHE_CONFIG:
+            cache_config_class = NEEDS_CACHE_CONFIG[self.cache_implementation]
+            if self.cache_config is None:
+                self.cache_config = cache_config_class()
+            elif isinstance(self.cache_config, dict):
+                self.cache_config = cache_config_class.from_dict(self.cache_config)
+        self.return_legacy_cache = kwargs.pop("return_legacy_cache", None)
+
+        # Parameters for manipulation of the model output logits
+        self.temperature = kwargs.pop("temperature", 1.0)
+        self.top_k = kwargs.pop("top_k", 50)
+        self.top_p = kwargs.pop("top_p", 1.0)
+        self.min_p = kwargs.pop("min_p", None)
+        self.typical_p = kwargs.pop("typical_p", 1.0)
+        self.epsilon_cutoff = kwargs.pop("epsilon_cutoff", 0.0)
+        self.eta_cutoff = kwargs.pop("eta_cutoff", 0.0)
+        self.diversity_penalty = kwargs.pop("diversity_penalty", 0.0)
+        self.repetition_penalty = kwargs.pop("repetition_penalty", 1.0)
+        self.encoder_repetition_penalty = kwargs.pop("encoder_repetition_penalty", 1.0)
+        self.length_penalty = kwargs.pop("length_penalty", 1.0)
+        self.no_repeat_ngram_size = kwargs.pop("no_repeat_ngram_size", 0)
+        self.bad_words_ids = kwargs.pop("bad_words_ids", None)
+        self.force_words_ids = kwargs.pop("force_words_ids", None)
+        self.renormalize_logits = kwargs.pop("renormalize_logits", False)
+        self.constraints = kwargs.pop("constraints", None)
+        self.forced_bos_token_id = kwargs.pop("forced_bos_token_id", None)
+        self.forced_eos_token_id = kwargs.pop("forced_eos_token_id", None)
+        self.remove_invalid_values = kwargs.pop("remove_invalid_values", False)
+        self.exponential_decay_length_penalty = kwargs.pop("exponential_decay_length_penalty", None)
+        self.suppress_tokens = kwargs.pop("suppress_tokens", None)
+        self.begin_suppress_tokens = kwargs.pop("begin_suppress_tokens", None)
+        self.forced_decoder_ids = kwargs.pop("forced_decoder_ids", None)
+        self.sequence_bias = kwargs.pop("sequence_bias", None)
+        self.token_healing = kwargs.pop("token_healing", False)
+        self.guidance_scale = kwargs.pop("guidance_scale", None)
+        self.low_memory = kwargs.pop("low_memory", None)
+        watermarking_config = kwargs.pop("watermarking_config", None)
+        if watermarking_config is None:
+            self.watermarking_config = None
+        elif isinstance(watermarking_config, BaseWatermarkingConfig):
+            self.watermarking_config = watermarking_config
+        else:
+            self.watermarking_config = WatermarkingConfig.from_dict(watermarking_config)
+
+        # Parameters that define the output variables of `generate`
+        self.num_return_sequences = kwargs.pop("num_return_sequences", 1)
+        self.output_attentions = kwargs.pop("output_attentions", False)
+        self.output_hidden_states = kwargs.pop("output_hidden_states", False)
+        self.output_scores = kwargs.pop("output_scores", False)
+        self.output_logits = kwargs.pop("output_logits", None)
+        self.return_dict_in_generate = kwargs.pop("return_dict_in_generate", False)
+
+        # Special tokens that can be used at generation time
+        self.pad_token_id = kwargs.pop("pad_token_id", None)
+        self.bos_token_id = kwargs.pop("bos_token_id", None)
+        self.eos_token_id = kwargs.pop("eos_token_id", None)
+
+        # Generation parameters exclusive to encoder-decoder models
+        self.encoder_no_repeat_ngram_size = kwargs.pop("encoder_no_repeat_ngram_size", 0)
+        self.decoder_start_token_id = kwargs.pop("decoder_start_token_id", None)
+
+        # Assistant generation
+        self.is_assistant = False
+        self.num_assistant_tokens = kwargs.pop("num_assistant_tokens", 20)
+        self.num_assistant_tokens_schedule = kwargs.pop("num_assistant_tokens_schedule", "constant")
+        self.assistant_confidence_threshold = kwargs.pop("assistant_confidence_threshold", 0.4)
+        self.prompt_lookup_num_tokens = kwargs.pop("prompt_lookup_num_tokens", None)
+        self.max_matching_ngram_size = kwargs.pop("max_matching_ngram_size", None)
+        self.assistant_early_exit = kwargs.pop("assistant_early_exit", None)
+        ## assistant generation for different tokenizers, the windows size for assistant/target model
+        self.assistant_lookbehind = kwargs.pop("assistant_lookbehind", 10)
+        self.target_lookbehind = kwargs.pop("target_lookbehind", 10)
+
+        # Performances
+        self.compile_config = kwargs.pop("compile_config", CompileConfig())
+
+        # Wild card
+        self.generation_kwargs = kwargs.pop("generation_kwargs", {})
+
+        # The remaining attributes do not parametrize `.generate()`, but are informative and/or used by the hub
+        # interface.
+        self._from_model_config = kwargs.pop("_from_model_config", False)
+        self._commit_hash = kwargs.pop("_commit_hash", None)
+        self.transformers_version = kwargs.pop("transformers_version", __version__)
+
+        # Additional attributes without default values
+        if not self._from_model_config:
+            # we don't want to copy values from the model config if we're initializing a `GenerationConfig` from a
+            # model's default configuration file
+            for key, value in kwargs.items():
+                try:
+                    setattr(self, key, value)
+                except AttributeError as err:
+                    logger.error(f"Can't set {key} with value {value} for {self}")
+                    raise err
+
+        # Validate the values of the attributes
+        self.validate(is_init=True)
+
+    def __hash__(self):
+        return hash(self.to_json_string(ignore_metadata=True))
+
+    def __eq__(self, other):
+        if not isinstance(other, GenerationConfig):
+            return False
+
+        self_without_metadata = self.to_json_string(use_diff=False, ignore_metadata=True)
+        other_without_metadata = other.to_json_string(use_diff=False, ignore_metadata=True)
+        return self_without_metadata == other_without_metadata
+
+    def __repr__(self):
+        return f"{self.__class__.__name__} {self.to_json_string(ignore_metadata=True)}"
+
+    def get_generation_mode(self, assistant_model: Optional["PreTrainedModel"] = None) -> GenerationMode:
+        """
+        Returns the generation mode triggered by the [`GenerationConfig`] instance.
+
+        Arg:
+            assistant_model (`PreTrainedModel`, *optional*):
+                The assistant model to be used for assisted generation. If set, the generation mode will be
+                assisted generation.
+
+        Returns:
+            `GenerationMode`: The generation mode triggered by the instance.
+        """
+        # TODO joao: find out a way of not depending on external fields (e.g. `assistant_model`), then make this a
+        # property and part of the `__repr__`
+        if self.constraints is not None or self.force_words_ids is not None:
+            generation_mode = GenerationMode.CONSTRAINED_BEAM_SEARCH
+        elif self.num_beams == 1:
+            if self.do_sample is False:
+                if (
+                    self.top_k is not None
+                    and self.top_k > 1
+                    and self.penalty_alpha is not None
+                    and self.penalty_alpha > 0
+                ):
+                    generation_mode = GenerationMode.CONTRASTIVE_SEARCH
+                else:
+                    generation_mode = GenerationMode.GREEDY_SEARCH
+            else:
+                generation_mode = GenerationMode.SAMPLE
+        else:
+            if self.num_beam_groups > 1:
+                generation_mode = GenerationMode.GROUP_BEAM_SEARCH
+            elif self.do_sample is True:
+                generation_mode = GenerationMode.BEAM_SAMPLE
+            else:
+                generation_mode = GenerationMode.BEAM_SEARCH
+
+        # Assisted generation may extend some generation modes
+        if (
+            assistant_model is not None
+            or self.prompt_lookup_num_tokens is not None
+            or self.assistant_early_exit is not None
+        ):
+            if generation_mode in ("greedy_search", "sample"):
+                generation_mode = GenerationMode.ASSISTED_GENERATION
+            else:
+                raise ValueError(
+                    "You've set `assistant_model`, which triggers assisted generate. Currently, assisted generate "
+                    "is only supported with Greedy Search and Sample."
+                )
+
+        # DoLa generation may extend some generation modes
+        if self.dola_layers is not None:
+            if generation_mode in ("greedy_search", "sample"):
+                generation_mode = GenerationMode.DOLA_GENERATION
+            else:
+                raise ValueError(
+                    "You've set `dola_layers`, which triggers DoLa generate. Currently, DoLa generate "
+                    "is only supported with Greedy Search and Sample."
+                )
+        return generation_mode
+
+    def validate(self, is_init=False):
+        """
+        Validates the values of the attributes of the [`GenerationConfig`] instance. Raises exceptions in the presence
+        of parameterization that can be detected as incorrect from the configuration instance alone.
+
+        Note that some parameters not validated here are best validated at generate runtime, as they may depend on
+        other inputs and/or the model, such as parameters related to the generation length.
+
+        Arg:
+            is_init (`bool`, *optional*, defaults to `False`):
+                Whether the validation is performed during the initialization of the instance.
+        """
+
+        # Validation of individual attributes
+        if self.early_stopping not in {True, False, "never"}:
+            raise ValueError(f"`early_stopping` must be a boolean or 'never', but is {self.early_stopping}.")
+        if self.max_new_tokens is not None and self.max_new_tokens <= 0:
+            raise ValueError(f"`max_new_tokens` must be greater than 0, but is {self.max_new_tokens}.")
+        if self.pad_token_id is not None and self.pad_token_id < 0:
+            warnings.warn(
+                f"`pad_token_id` should be positive but got {self.pad_token_id}. This will cause errors when batch "
+                "generating, if there is padding. Please set `pad_token_id` explicitly as "
+                "`model.generation_config.pad_token_id=PAD_TOKEN_ID` to avoid errors in generation"
+            )
+
+        # Validation of attribute relations:
+        fix_location = ""
+        if is_init:
+            fix_location = (
+                " This was detected when initializing the generation config instance, which means the corresponding "
+                "file may hold incorrect parameterization and should be fixed."
+            )
+
+        # 1. detect sampling-only parameterization when not in sampling mode
+        if self.do_sample is False:
+            greedy_wrong_parameter_msg = (
+                "`do_sample` is set to `False`. However, `{flag_name}` is set to `{flag_value}` -- this flag is only "
+                "used in sample-based generation modes. You should set `do_sample=True` or unset `{flag_name}`."
+                + fix_location
+            )
+            if self.temperature is not None and self.temperature != 1.0:
+                warnings.warn(
+                    greedy_wrong_parameter_msg.format(flag_name="temperature", flag_value=self.temperature),
+                    UserWarning,
+                )
+            if self.top_p is not None and self.top_p != 1.0:
+                warnings.warn(
+                    greedy_wrong_parameter_msg.format(flag_name="top_p", flag_value=self.top_p),
+                    UserWarning,
+                )
+            if self.min_p is not None:
+                warnings.warn(
+                    greedy_wrong_parameter_msg.format(flag_name="min_p", flag_value=self.min_p),
+                    UserWarning,
+                )
+            if self.typical_p is not None and self.typical_p != 1.0:
+                warnings.warn(
+                    greedy_wrong_parameter_msg.format(flag_name="typical_p", flag_value=self.typical_p),
+                    UserWarning,
+                )
+            if (
+                self.top_k is not None and self.top_k != 50 and self.penalty_alpha is None
+            ):  # contrastive search uses top_k
+                warnings.warn(
+                    greedy_wrong_parameter_msg.format(flag_name="top_k", flag_value=self.top_k),
+                    UserWarning,
+                )
+            if self.epsilon_cutoff is not None and self.epsilon_cutoff != 0.0:
+                warnings.warn(
+                    greedy_wrong_parameter_msg.format(flag_name="epsilon_cutoff", flag_value=self.epsilon_cutoff),
+                    UserWarning,
+                )
+            if self.eta_cutoff is not None and self.eta_cutoff != 0.0:
+                warnings.warn(
+                    greedy_wrong_parameter_msg.format(flag_name="eta_cutoff", flag_value=self.eta_cutoff),
+                    UserWarning,
+                )
+
+        # 2. detect beam-only parameterization when not in beam mode
+        if self.num_beams is None:
+            warnings.warn("`num_beams` is set to None - defaulting to 1.", UserWarning)
+            self.num_beams = 1
+
+        if self.num_beams == 1:
+            single_beam_wrong_parameter_msg = (
+                "`num_beams` is set to 1. However, `{flag_name}` is set to `{flag_value}` -- this flag is only used "
+                "in beam-based generation modes. You should set `num_beams>1` or unset `{flag_name}`." + fix_location
+            )
+            if self.early_stopping is not False:
+                warnings.warn(
+                    single_beam_wrong_parameter_msg.format(flag_name="early_stopping", flag_value=self.early_stopping),
+                    UserWarning,
+                )
+            if self.num_beam_groups is not None and self.num_beam_groups != 1:
+                warnings.warn(
+                    single_beam_wrong_parameter_msg.format(
+                        flag_name="num_beam_groups", flag_value=self.num_beam_groups
+                    ),
+                    UserWarning,
+                )
+            if self.diversity_penalty is not None and self.diversity_penalty != 0.0:
+                warnings.warn(
+                    single_beam_wrong_parameter_msg.format(
+                        flag_name="diversity_penalty", flag_value=self.diversity_penalty
+                    ),
+                    UserWarning,
+                )
+            if self.length_penalty is not None and self.length_penalty != 1.0:
+                warnings.warn(
+                    single_beam_wrong_parameter_msg.format(flag_name="length_penalty", flag_value=self.length_penalty),
+                    UserWarning,
+                )
+            if self.constraints is not None:
+                warnings.warn(
+                    single_beam_wrong_parameter_msg.format(flag_name="constraints", flag_value=self.constraints),
+                    UserWarning,
+                )
+
+        # 3. detect incorrect paramaterization specific to advanced beam modes
+        else:
+            # constrained beam search
+            if self.constraints is not None or self.force_words_ids is not None:
+                constrained_wrong_parameter_msg = (
+                    "one of `constraints`, `force_words_ids` is not `None`, triggering constrained beam search. However, "
+                    "`{flag_name}` is set to `{flag_value}`, which is incompatible with this generation mode. Set "
+                    "`constraints` and `force_words_ids` to `None` or unset `{flag_name}` to continue." + fix_location
+                )
+                if self.do_sample is True:
+                    raise ValueError(
+                        constrained_wrong_parameter_msg.format(flag_name="do_sample", flag_value=self.do_sample)
+                    )
+                if self.num_beam_groups is not None and self.num_beam_groups != 1:
+                    raise ValueError(
+                        constrained_wrong_parameter_msg.format(
+                            flag_name="num_beam_groups", flag_value=self.num_beam_groups
+                        )
+                    )
+            # group beam search
+            if self.diversity_penalty != 0.0 or self.num_beam_groups != 1:
+                group_error_prefix = (
+                    "`diversity_penalty` is not 0.0 or `num_beam_groups` is not 1, triggering group beam search. In "
+                    "this generation mode, "
+                )
+                if self.do_sample is True:
+                    raise ValueError(group_error_prefix + "`do_sample` must be set to `False`")
+                if self.num_beams % self.num_beam_groups != 0:
+                    raise ValueError(group_error_prefix + "`num_beams` should be divisible by `num_beam_groups`")
+                if self.diversity_penalty == 0.0:
+                    raise ValueError(
+                        group_error_prefix
+                        + "`diversity_penalty` should be greater than `0.0`, otherwise your groups will be identical."
+                    )
+            # DoLa generation
+            if self.dola_layers is not None and (self.repetition_penalty is None or self.repetition_penalty < 1.2):
+                warnings.warn(
+                    "`dola_layers` is set to trigger DoLa decoding, but `repetition_penalty` is set to a value of "
+                    f"{self.repetition_penalty}, which could induce unwanted repetition. The recommended value for "
+                    "DoLa decoding is `repetition_penalty>=1.2`.",
+                    UserWarning,
+                )
+
+        # 4. check `num_return_sequences`
+        if self.num_return_sequences != 1:
+            if self.num_beams == 1:
+                if self.do_sample is False:
+                    raise ValueError(
+                        "Greedy methods without beam search do not support `num_return_sequences` different than 1 "
+                        f"(got {self.num_return_sequences})."
+                    )
+            elif self.num_return_sequences > self.num_beams:
+                raise ValueError(
+                    f"`num_return_sequences` ({self.num_return_sequences}) has to be smaller or equal to `num_beams` "
+                    f"({self.num_beams})."
+                )
+
+        # 5. check cache-related arguments
+        if self.cache_implementation is not None and self.cache_implementation not in ALL_CACHE_IMPLEMENTATIONS:
+            raise ValueError(
+                f"Invalid `cache_implementation` ({self.cache_implementation}). Choose one of: "
+                f"{ALL_CACHE_IMPLEMENTATIONS}"
+            )
+        if self.cache_config is not None:
+            cache_class = NEEDS_CACHE_CONFIG.get(self.cache_implementation)
+            if cache_class is None:
+                raise ValueError(
+                    "You provided a `cache_config` but the cache implementation you are using "
+                    f"({self.cache_implementation}) does not require any config. Make sure to use the "
+                    "correct cache implementation matching your cache config."
+                )
+            if not isinstance(self.cache_config, cache_class):
+                self.cache_config = cache_class.from_dict(self.cache_config)
+            self.cache_config.validate()
+        if self.use_cache is False:
+            # In this case, all cache-related arguments should be unset. However, since `use_cache=False` is often used
+            # passed to `generate` directly to hot-fix cache issues, let's raise a warning instead of an error
+            # (otherwise a user might need to overwrite several parameters).
+            no_cache_warning = (
+                "You have set `use_cache` to `False`, but {cache_arg} is set to {cache_arg_value}. {cache_arg} will "
+                "have no effect."
+            )
+            for arg_name in ("cache_implementation", "cache_config", "return_legacy_cache"):
+                if getattr(self, arg_name) is not None:
+                    logger.warning_once(
+                        no_cache_warning.format(cache_arg=arg_name, cache_arg_value=getattr(self, arg_name)),
+                        UserWarning,
+                    )
+
+        # 6.  check watermarking arguments
+        if self.watermarking_config is not None:
+            if not (
+                isinstance(self.watermarking_config, WatermarkingConfig)
+                or isinstance(self.watermarking_config, SynthIDTextWatermarkingConfig)
+            ):
+                warnings.warn(
+                    "`watermarking_config` as a dict is deprecated. Please construct `watermarking_config` object with "
+                    "`WatermarkingConfig` or `SynthIDTextWatermarkingConfig` class.",
+                    FutureWarning,
+                )
+                self.watermarking_config = WatermarkingConfig.from_dict(self.watermarking_config)
+            self.watermarking_config.validate()
+
+        # 7. performances arguments
+        if not isinstance(self.compile_config, CompileConfig):
+            raise ValueError(
+                f"You provided `compile_config` as an instance of {type(self.compile_config)}, but it must be an instance of `CompileConfig`."
+            )
+
+        # 8. other incorrect combinations
+        if self.return_dict_in_generate is not True:
+            for extra_output_flag in self.extra_output_flags:
+                if getattr(self, extra_output_flag) is True:
+                    warnings.warn(
+                        f"`return_dict_in_generate` is NOT set to `True`, but `{extra_output_flag}` is. When "
+                        f"`return_dict_in_generate` is not `True`, `{extra_output_flag}` is ignored.",
+                        UserWarning,
+                    )
+
+        # 8. check common issue: passing `generate` arguments inside the generation config
+        generate_arguments = (
+            "logits_processor",
+            "stopping_criteria",
+            "prefix_allowed_tokens_fn",
+            "synced_gpus",
+            "assistant_model",
+            "streamer",
+            "negative_prompt_ids",
+            "negative_prompt_attention_mask",
+        )
+        for arg in generate_arguments:
+            if hasattr(self, arg):
+                raise ValueError(
+                    f"Argument `{arg}` is not a valid argument of `GenerationConfig`. It should be passed to "
+                    "`generate()` (or a pipeline) directly."
+                )
+
+    def save_pretrained(
+        self,
+        save_directory: Union[str, os.PathLike],
+        config_file_name: Optional[Union[str, os.PathLike]] = None,
+        push_to_hub: bool = False,
+        **kwargs,
+    ):
+        r"""
+        Save a generation configuration object to the directory `save_directory`, so that it can be re-loaded using the
+        [`~GenerationConfig.from_pretrained`] class method.
+
+        Args:
+            save_directory (`str` or `os.PathLike`):
+                Directory where the configuration JSON file will be saved (will be created if it does not exist).
+            config_file_name (`str` or `os.PathLike`, *optional*, defaults to `"generation_config.json"`):
+                Name of the generation configuration JSON file to be saved in `save_directory`.
+            push_to_hub (`bool`, *optional*, defaults to `False`):
+                Whether or not to push your model to the Hugging Face model hub after saving it. You can specify the
+                repository you want to push to with `repo_id` (will default to the name of `save_directory` in your
+                namespace).
+            kwargs (`Dict[str, Any]`, *optional*):
+                Additional key word arguments passed along to the [`~utils.PushToHubMixin.push_to_hub`] method.
+        """
+
+        # At save time, validate the instance -- if any warning/exception is thrown, we refuse to save the instance.
+        # This strictness is enforced to prevent bad configurations from being saved and re-used.
+        try:
+            with warnings.catch_warnings(record=True) as caught_warnings:
+                self.validate()
+            if len(caught_warnings) > 0:
+                raise ValueError(str([w.message for w in caught_warnings]))
+        except ValueError as exc:
+            raise ValueError(
+                "The generation config instance is invalid -- `.validate()` throws warnings and/or exceptions. "
+                "Fix these issues to save the configuration.\n\nThrown during validation:\n" + str(exc)
+            )
+
+        use_auth_token = kwargs.pop("use_auth_token", None)
+
+        if use_auth_token is not None:
+            warnings.warn(
+                "The `use_auth_token` argument is deprecated and will be removed in v5 of Transformers. "
+                "Please use `token` instead.",
+                FutureWarning,
+            )
+            if kwargs.get("token", None) is not None:
+                raise ValueError(
+                    "`token` and `use_auth_token` are both specified. Please set only the argument `token`."
+                )
+            kwargs["token"] = use_auth_token
+
+        config_file_name = config_file_name if config_file_name is not None else GENERATION_CONFIG_NAME
+
+        if os.path.isfile(save_directory):
+            raise AssertionError(f"Provided path ({save_directory}) should be a directory, not a file")
+
+        os.makedirs(save_directory, exist_ok=True)
+
+        if push_to_hub:
+            commit_message = kwargs.pop("commit_message", None)
+            repo_id = kwargs.pop("repo_id", save_directory.split(os.path.sep)[-1])
+            repo_id = self._create_repo(repo_id, **kwargs)
+            files_timestamps = self._get_files_timestamps(save_directory)
+
+        output_config_file = os.path.join(save_directory, config_file_name)
+
+        self.to_json_file(output_config_file, use_diff=True)
+        logger.info(f"Configuration saved in {output_config_file}")
+
+        if push_to_hub:
+            self._upload_modified_files(
+                save_directory,
+                repo_id,
+                files_timestamps,
+                commit_message=commit_message,
+                token=kwargs.get("token"),
+            )
+
+    @classmethod
+    def from_pretrained(
+        cls,
+        pretrained_model_name: Union[str, os.PathLike],
+        config_file_name: Optional[Union[str, os.PathLike]] = None,
+        cache_dir: Optional[Union[str, os.PathLike]] = None,
+        force_download: bool = False,
+        local_files_only: bool = False,
+        token: Optional[Union[str, bool]] = None,
+        revision: str = "main",
+        **kwargs,
+    ) -> "GenerationConfig":
+        r"""
+        Instantiate a [`GenerationConfig`] from a generation configuration file.
+
+        Args:
+            pretrained_model_name (`str` or `os.PathLike`):
+                This can be either:
+
+                - a string, the *model id* of a pretrained model configuration hosted inside a model repo on
+                  huggingface.co.
+                - a path to a *directory* containing a configuration file saved using the
+                  [`~GenerationConfig.save_pretrained`] method, e.g., `./my_model_directory/`.
+            config_file_name (`str` or `os.PathLike`, *optional*, defaults to `"generation_config.json"`):
+                Name of the generation configuration JSON file to be loaded from `pretrained_model_name`.
+            cache_dir (`str` or `os.PathLike`, *optional*):
+                Path to a directory in which a downloaded pretrained model configuration should be cached if the
+                standard cache should not be used.
+            force_download (`bool`, *optional*, defaults to `False`):
+                Whether or not to force to (re-)download the configuration files and override the cached versions if
+                they exist.
+            resume_download:
+                Deprecated and ignored. All downloads are now resumed by default when possible.
+                Will be removed in v5 of Transformers.
+            proxies (`Dict[str, str]`, *optional*):
+                A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128',
+                'http://hostname': 'foo.bar:4012'}.` The proxies are used on each request.
+            token (`str` or `bool`, *optional*):
+                The token to use as HTTP bearer authorization for remote files. If `True`, or not specified, will use
+                the token generated when running `huggingface-cli login` (stored in `~/.huggingface`).
+            revision (`str`, *optional*, defaults to `"main"`):
+                The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a
+                git-based system for storing models and other artifacts on huggingface.co, so `revision` can be any
+                identifier allowed by git.
+
+                <Tip>
+
+                To test a pull request you made on the Hub, you can pass `revision="refs/pr/<pr_number>"`.
+
+                </Tip>
+
+            return_unused_kwargs (`bool`, *optional*, defaults to `False`):
+                If `False`, then this function returns just the final configuration object.
+
+                If `True`, then this functions returns a `Tuple(config, unused_kwargs)` where *unused_kwargs* is a
+                dictionary consisting of the key/value pairs whose keys are not configuration attributes: i.e., the
+                part of `kwargs` which has not been used to update `config` and is otherwise ignored.
+            subfolder (`str`, *optional*, defaults to `""`):
+                In case the relevant files are located inside a subfolder of the model repo on huggingface.co, you can
+                specify the folder name here.
+            kwargs (`Dict[str, Any]`, *optional*):
+                The values in kwargs of any keys which are configuration attributes will be used to override the loaded
+                values. Behavior concerning key/value pairs whose keys are *not* configuration attributes is controlled
+                by the `return_unused_kwargs` keyword parameter.
+
+        Returns:
+            [`GenerationConfig`]: The configuration object instantiated from this pretrained model.
+
+        Examples:
+
+        ```python
+        >>> from transformers import GenerationConfig
+
+        >>> # Download configuration from huggingface.co and cache.
+        >>> generation_config = GenerationConfig.from_pretrained("openai-community/gpt2")
+
+        >>> # E.g. config was saved using *save_pretrained('./test/saved_model/')*
+        >>> generation_config.save_pretrained("./test/saved_model/")
+        >>> generation_config = GenerationConfig.from_pretrained("./test/saved_model/")
+
+        >>> # You can also specify configuration names to your generation configuration file
+        >>> generation_config.save_pretrained("./test/saved_model/", config_file_name="my_configuration.json")
+        >>> generation_config = GenerationConfig.from_pretrained("./test/saved_model/", "my_configuration.json")
+
+        >>> # If you'd like to try a minor variation to an existing configuration, you can also pass generation
+        >>> # arguments to `.from_pretrained()`. Be mindful that typos and unused arguments will be ignored
+        >>> generation_config, unused_kwargs = GenerationConfig.from_pretrained(
+        ...     "openai-community/gpt2", top_k=1, foo=False, do_sample=True, return_unused_kwargs=True
+        ... )
+        >>> generation_config.top_k
+        1
+
+        >>> unused_kwargs
+        {'foo': False}
+        ```"""
+        config_file_name = config_file_name if config_file_name is not None else GENERATION_CONFIG_NAME
+
+        resume_download = kwargs.pop("resume_download", None)
+        proxies = kwargs.pop("proxies", None)
+        use_auth_token = kwargs.pop("use_auth_token", None)
+        subfolder = kwargs.pop("subfolder", "")
+        from_pipeline = kwargs.pop("_from_pipeline", None)
+        from_auto_class = kwargs.pop("_from_auto", False)
+        commit_hash = kwargs.pop("_commit_hash", None)
+
+        if use_auth_token is not None:
+            warnings.warn(
+                "The `use_auth_token` argument is deprecated and will be removed in v5 of Transformers. Please use `token` instead.",
+                FutureWarning,
+            )
+            if token is not None:
+                raise ValueError(
+                    "`token` and `use_auth_token` are both specified. Please set only the argument `token`."
+                )
+            token = use_auth_token
+
+        user_agent = {"file_type": "config", "from_auto_class": from_auto_class}
+        if from_pipeline is not None:
+            user_agent["using_pipeline"] = from_pipeline
+
+        config_path = os.path.join(pretrained_model_name, config_file_name)
+        config_path = str(config_path)
+
+        is_local = os.path.exists(config_path)
+        if os.path.isfile(os.path.join(subfolder, config_path)):
+            # Special case when config_path is a local file
+            resolved_config_file = config_path
+            is_local = True
+        elif is_remote_url(config_path):
+            configuration_file = config_path
+            resolved_config_file = download_url(config_path)
+        else:
+            configuration_file = config_file_name
+            try:
+                # Load from local folder or from cache or download from model Hub and cache
+                resolved_config_file = cached_file(
+                    pretrained_model_name,
+                    configuration_file,
+                    cache_dir=cache_dir,
+                    force_download=force_download,
+                    proxies=proxies,
+                    resume_download=resume_download,
+                    local_files_only=local_files_only,
+                    token=token,
+                    user_agent=user_agent,
+                    revision=revision,
+                    subfolder=subfolder,
+                    _commit_hash=commit_hash,
+                )
+                commit_hash = extract_commit_hash(resolved_config_file, commit_hash)
+            except EnvironmentError:
+                # Raise any environment error raise by `cached_file`. It will have a helpful error message adapted to
+                # the original exception.
+                raise
+            except Exception:
+                # For any other exception, we throw a generic error.
+                raise EnvironmentError(
+                    f"Can't load the configuration of '{pretrained_model_name}'. If you were trying to load it"
+                    " from 'https://huggingface.co/models', make sure you don't have a local directory with the same"
+                    f" name. Otherwise, make sure '{pretrained_model_name}' is the correct path to a directory"
+                    f" containing a {configuration_file} file"
+                )
+
+        try:
+            # Load config dict
+            config_dict = cls._dict_from_json_file(resolved_config_file)
+            config_dict["_commit_hash"] = commit_hash
+        except (json.JSONDecodeError, UnicodeDecodeError):
+            raise EnvironmentError(
+                f"It looks like the config file at '{resolved_config_file}' is not a valid JSON file."
+            )
+
+        if is_local:
+            logger.info(f"loading configuration file {resolved_config_file}")
+        else:
+            logger.info(f"loading configuration file {configuration_file} from cache at {resolved_config_file}")
+
+        if kwargs.get("return_unused_kwargs") is True:
+            config, unused_kwargs = cls.from_dict(config_dict, **kwargs)
+            config._original_object_hash = hash(config)  # Hash to detect whether the instance was modified
+            return config, unused_kwargs
+        else:
+            config = cls.from_dict(config_dict, **kwargs)
+            config._original_object_hash = hash(config)  # Hash to detect whether the instance was modified
+            return config
+
+    @classmethod
+    def _dict_from_json_file(cls, json_file: Union[str, os.PathLike]):
+        with open(json_file, "r", encoding="utf-8") as reader:
+            text = reader.read()
+        return json.loads(text)
+
+    @classmethod
+    def from_dict(cls, config_dict: Dict[str, Any], **kwargs) -> "GenerationConfig":
+        """
+        Instantiates a [`GenerationConfig`] from a Python dictionary of parameters.
+
+        Args:
+            config_dict (`Dict[str, Any]`):
+                Dictionary that will be used to instantiate the configuration object.
+            kwargs (`Dict[str, Any]`):
+                Additional parameters from which to initialize the configuration object.
+
+        Returns:
+            [`GenerationConfig`]: The configuration object instantiated from those parameters.
+        """
+        return_unused_kwargs = kwargs.pop("return_unused_kwargs", False)
+        # Those arguments may be passed along for our internal telemetry.
+        # We remove them so they don't appear in `return_unused_kwargs`.
+        kwargs.pop("_from_auto", None)
+        kwargs.pop("_from_pipeline", None)
+        # The commit hash might have been updated in the `config_dict`, we don't want the kwargs to erase that update.
+        if "_commit_hash" in kwargs and "_commit_hash" in config_dict:
+            kwargs["_commit_hash"] = config_dict["_commit_hash"]
+
+        # The line below allows model-specific config to be loaded as well through kwargs, with safety checks.
+        # See https://github.com/huggingface/transformers/pull/21269
+        config = cls(**{**config_dict, **kwargs})
+        unused_kwargs = config.update(**kwargs)
+
+        logger.info(f"Generate config {config}")
+        if return_unused_kwargs:
+            return config, unused_kwargs
+        else:
+            return config
+
+    def dict_torch_dtype_to_str(self, d: Dict[str, Any]) -> None:
+        """
+        Checks whether the passed dictionary and its nested dicts have a *torch_dtype* key and if it's not None,
+        converts torch.dtype to a string of just the type. For example, `torch.float32` get converted into *"float32"*
+        string, which can then be stored in the json format.
+        """
+        if d.get("torch_dtype", None) is not None and not isinstance(d["torch_dtype"], str):
+            d["torch_dtype"] = str(d["torch_dtype"]).split(".")[1]
+        for value in d.values():
+            if isinstance(value, dict):
+                self.dict_torch_dtype_to_str(value)
+
+    def to_diff_dict(self) -> Dict[str, Any]:
+        """
+        Removes all attributes from config which correspond to the default config attributes for better readability and
+        serializes to a Python dictionary.
+
+        Returns:
+            `Dict[str, Any]`: Dictionary of all the attributes that make up this configuration instance,
+        """
+        config_dict = self.to_dict()
+
+        # get the default config dict
+        default_config_dict = GenerationConfig().to_dict()
+
+        serializable_config_dict = {}
+
+        # only serialize values that differ from the default config
+        for key, value in config_dict.items():
+            if key not in default_config_dict or key == "transformers_version" or value != default_config_dict[key]:
+                serializable_config_dict[key] = value
+
+        self.dict_torch_dtype_to_str(serializable_config_dict)
+        return serializable_config_dict
+
+    def to_dict(self) -> Dict[str, Any]:
+        """
+        Serializes this instance to a Python dictionary.
+
+        Returns:
+            `Dict[str, Any]`: Dictionary of all the attributes that make up this configuration instance.
+        """
+        output = copy.deepcopy(self.__dict__)
+
+        # Fields to ignore at serialization time
+        if "_commit_hash" in output:
+            del output["_commit_hash"]
+        if "_original_object_hash" in output:
+            del output["_original_object_hash"]
+        if "compile_config" in output:
+            del output["compile_config"]
+
+        # Transformers version when serializing this file
+        output["transformers_version"] = __version__
+
+        self.dict_torch_dtype_to_str(output)
+        return output
+
+    def to_json_string(self, use_diff: bool = True, ignore_metadata: bool = False) -> str:
+        """
+        Serializes this instance to a JSON string.
+
+        Args:
+            use_diff (`bool`, *optional*, defaults to `True`):
+                If set to `True`, only the difference between the config instance and the default `GenerationConfig()`
+                is serialized to JSON string.
+            ignore_metadata (`bool`, *optional*, defaults to `False`):
+                Whether to ignore the metadata fields present in the instance
+
+        Returns:
+            `str`: String containing all the attributes that make up this configuration instance in JSON format.
+        """
+        if use_diff is True:
+            config_dict = self.to_diff_dict()
+        else:
+            config_dict = self.to_dict()
+
+        if ignore_metadata:
+            for metadata_field in METADATA_FIELDS:
+                config_dict.pop(metadata_field, None)
+
+        def convert_keys_to_string(obj):
+            if isinstance(obj, dict):
+                return {str(key): convert_keys_to_string(value) for key, value in obj.items()}
+            elif isinstance(obj, list):
+                return [convert_keys_to_string(item) for item in obj]
+            else:
+                return obj
+
+        def convert_dataclass_to_dict(obj):
+            if isinstance(obj, dict):
+                return {key: convert_dataclass_to_dict(value) for key, value in obj.items()}
+            elif is_dataclass(obj):
+                return obj.to_dict()
+            else:
+                return obj
+
+        config_dict = convert_keys_to_string(config_dict)
+        config_dict = convert_dataclass_to_dict(config_dict)
+
+        return json.dumps(config_dict, indent=2, sort_keys=True) + "\n"
+
+    def to_json_file(self, json_file_path: Union[str, os.PathLike], use_diff: bool = True):
+        """
+        Save this instance to a JSON file.
+
+        Args:
+            json_file_path (`str` or `os.PathLike`):
+                Path to the JSON file in which this configuration instance's parameters will be saved.
+            use_diff (`bool`, *optional*, defaults to `True`):
+                If set to `True`, only the difference between the config instance and the default `GenerationConfig()`
+                is serialized to JSON file.
+        """
+        with open(json_file_path, "w", encoding="utf-8") as writer:
+            writer.write(self.to_json_string(use_diff=use_diff))
+
+    @classmethod
+    def from_model_config(cls, model_config: PretrainedConfig) -> "GenerationConfig":
+        """
+        Instantiates a [`GenerationConfig`] from a [`PretrainedConfig`]. This function is useful to convert legacy
+        [`PretrainedConfig`] objects, which may contain generation parameters, into a stand-alone [`GenerationConfig`].
+
+        Args:
+            model_config (`PretrainedConfig`):
+                The model config that will be used to instantiate the generation config.
+
+        Returns:
+            [`GenerationConfig`]: The configuration object instantiated from those parameters.
+        """
+        config_dict = model_config.to_dict()
+        config_dict.pop("_from_model_config", None)
+
+        # Removes all `None` from the model config dict -- this lets the generation config defaults to take hold
+        config_dict = {key: value for key, value in config_dict.items() if value is not None}
+
+        generation_config = cls.from_dict(config_dict, return_unused_kwargs=False, _from_model_config=True)
+
+        # Special case: some models have generation attributes set in the decoder. Use them if still unset in the
+        # generation config (which in turn is defined from the outer attributes of model config).
+        decoder_config = model_config.get_text_config(decoder=True)
+        if decoder_config is not model_config:
+            default_generation_config = GenerationConfig()
+            decoder_config_dict = decoder_config.to_dict()
+            for attr in generation_config.to_dict().keys():
+                is_unset = getattr(generation_config, attr) == getattr(default_generation_config, attr)
+                if attr in decoder_config_dict and is_unset:
+                    setattr(generation_config, attr, decoder_config_dict[attr])
+
+        # If any `output_...` flag is set to `True`, we ensure `return_dict_in_generate` is set to `True`.
+        if generation_config.return_dict_in_generate is False:
+            if any(
+                getattr(generation_config, extra_output_flag, False)
+                for extra_output_flag in generation_config.extra_output_flags
+            ):
+                generation_config.return_dict_in_generate = True
+
+        # Hash to detect whether the instance was modified
+        generation_config._original_object_hash = hash(generation_config)
+        return generation_config
+
+    def update(self, **kwargs):
+        """
+        Updates attributes of this class instance with attributes from `kwargs` if they match existing attributes,
+        returning all the unused kwargs.
+
+        Args:
+            kwargs (`Dict[str, Any]`):
+                Dictionary of attributes to tentatively update this class.
+
+        Returns:
+            `Dict[str, Any]`: Dictionary containing all the key-value pairs that were not used to update the instance.
+        """
+        to_remove = []
+        for key, value in kwargs.items():
+            if hasattr(self, key):
+                setattr(self, key, value)
+                to_remove.append(key)
+
+        # Confirm that the updated instance is still valid
+        self.validate()
+
+        # Remove all the attributes that were updated, without modifying the input dict
+        unused_kwargs = {key: value for key, value in kwargs.items() if key not in to_remove}
+        return unused_kwargs
+
+
+@dataclass
+class BaseWatermarkingConfig(ABC):
+    """Generic watermarking config"""
+
+    @classmethod
+    def from_dict(cls, config_dict, **kwargs):
+        """
+        Constructs a BaseWatermarkingConfig instance from a dictionary of parameters.
+
+        Args:
+            config_dict (Dict[str, Any]): Dictionary containing configuration parameters.
+            **kwargs: Additional keyword arguments to override dictionary values.
+
+        Returns:
+            BaseWatermarkingConfig: Instance of BaseWatermarkingConfig constructed from the dictionary.
+        """
+        config = cls(**config_dict)
+        to_remove = []
+        for key, value in kwargs.items():
+            if hasattr(config, key):
+                setattr(config, key, value)
+                to_remove.append(key)
+        for key in to_remove:
+            kwargs.pop(key, None)
+        return config
+
+    def to_json_file(self, json_file_path: Union[str, os.PathLike]):
+        """
+        Save this instance to a JSON file.
+
+        Args:
+            json_file_path (Union[str, os.PathLike]): Path to the JSON file in which this configuration instance's parameters will be saved.
+        """
+        with open(json_file_path, "w", encoding="utf-8") as writer:
+            config_dict = self.to_dict()
+            json_string = json.dumps(config_dict, indent=2, sort_keys=True) + "\n"
+
+            writer.write(json_string)
+
+    def to_dict(self) -> Dict[str, Any]:
+        """
+        Serializes this instance to a Python dictionary.
+
+        Returns:
+            Dict[str, Any]: Dictionary of all the attributes that make up this configuration instance.
+        """
+        output = copy.deepcopy(self.__dict__)
+        return output
+
+    def __iter__(self):
+        for attr, value in copy.deepcopy(self.__dict__).items():
+            yield attr, value
+
+    def __repr__(self):
+        return f"{self.__class__.__name__} {self.to_json_string()}"
+
+    def to_json_string(self):
+        """
+        Serializes this instance to a JSON formatted string.
+
+        Returns:
+            str: JSON formatted string representing the configuration instance.
+        """
+        return json.dumps(self.__dict__, indent=2) + "\n"
+
+    def update(self, **kwargs):
+        """
+        Update the configuration attributes with new values.
+
+        Args:
+            **kwargs: Keyword arguments representing configuration attributes and their new values.
+        """
+        for key, value in kwargs.items():
+            if hasattr(self, key):
+                setattr(self, key, value)
+
+    @abstractmethod
+    def validate(self): ...
+
+    @abstractmethod
+    def construct_processor(self, vocab_size): ...
+
+
+@dataclass
+class WatermarkingConfig(BaseWatermarkingConfig):
+    """
+    Class that holds arguments for watermark generation and should be passed into `GenerationConfig` during `generate`.
+    See [this paper](https://arxiv.org/abs/2306.04634) for more details on the arguments.
+
+    Accepts the following keys:
+        - greenlist_ratio (`float`):
+            Used for watermarking. The ratio of "green" tokens used to the vocabulary size. Defaults to 0.25.
+        - bias (`float`):
+            Used with watermarking. The bias added to the selected "green" tokens' logits. Defaults to 2.0.
+        - hashing_key (`int`):
+            Hashing key used for watermarking. Defaults to 15485863 (the millionth prime).
+        - seeding_scheme (`str`):
+            Algorithm to use for watermarking. Accepts values:
+                - "lefthash" (default): "green" tokens selection depend on the last token (Algorithm 2 from the paper)
+                - "selfhash": "green" tokens selection depends on the current token itself (Algorithm 3 from the paper)
+                    The downside of this scheme is that it considers all possible next tokens and can be slower than "lefthash".
+        - context_width(`int`):
+            The context length of previous tokens to use in seeding. Higher context length makes watermarking more robust.
+    """
+
+    def __init__(
+        self,
+        greenlist_ratio: Optional[float] = 0.25,
+        bias: Optional[float] = 2.0,
+        hashing_key: Optional[int] = 15485863,
+        seeding_scheme: Optional[str] = "lefthash",
+        context_width: Optional[int] = 1,
+    ):
+        self.greenlist_ratio = greenlist_ratio
+        self.bias = bias
+        self.hashing_key = hashing_key
+        self.seeding_scheme = seeding_scheme
+        self.context_width = context_width
+
+    def validate(self):
+        watermark_missing_arg_msg = (
+            "Some of the keys in `watermarking_config` are defined incorrectly. `{key}` should be {correct_value}` "
+            "but found {found_value}"
+        )
+        if self.seeding_scheme not in ["selfhash", "lefthash"]:
+            raise ValueError(
+                watermark_missing_arg_msg.format(
+                    key="seeding_scheme",
+                    correct_value="[`selfhash`, `lefthash`]",
+                    found_value=self.seeding_scheme,
+                ),
+            )
+        if not 0.0 <= self.greenlist_ratio <= 1.0:
+            raise ValueError(
+                watermark_missing_arg_msg.format(
+                    key="greenlist_ratio",
+                    correct_value="in range between 0.0 and 1.0",
+                    found_value=self.seeding_scheme,
+                ),
+            )
+        if not self.context_width >= 1:
+            raise ValueError(
+                watermark_missing_arg_msg.format(
+                    key="context_width",
+                    correct_value="a positive integer",
+                    found_value=self.context_width,
+                ),
+            )
+
+    def construct_processor(self, vocab_size: int, device) -> "WatermarkLogitsProcessor":
+        return WatermarkLogitsProcessor(
+            vocab_size=vocab_size,
+            device=device,
+            greenlist_ratio=self.greenlist_ratio,
+            bias=self.bias,
+            hashing_key=self.hashing_key,
+            seeding_scheme=self.seeding_scheme,
+            context_width=self.context_width,
+        )
+
+
+@dataclass
+class SynthIDTextWatermarkingConfig(BaseWatermarkingConfig):
+    """
+    Class that holds arguments for watermark generation and should be passed into `GenerationConfig` during `generate`.
+    See [this paper](https://www.nature.com/articles/s41586-024-08025-4) for more details on the arguments.
+
+    Args:
+        ngram_len (`int`):
+            Ngram length.
+        keys (`List[int]`):
+            A sequence of watermarking keys, one for each depth.
+        context_history_size (`int`, *optional*, defaults to 1024):
+            Size of the tensor to keep track of seen contexts.
+        sampling_table_seed (`int`, *optional*, defaults to 0):
+            Random seed to generate the sampling table.
+        sampling_table_size (`int`, *optional*, defaults to 65536):
+            Size of the sampling table.
+        skip_first_ngram_calls (`bool`, *optional*, defaults to `False`):
+            Whether to skip first ngram calls.
+        debug_mode (`bool`, optional, *optional*, defaults to `False`):
+            Logits are modified to uniform one got before watermarking modification is applied. This is to test the
+            implementation.
+
+    Examples:
+    ```python
+    >>> from transformers import AutoModelForCausalLM, AutoTokenizer, SynthIDTextWatermarkingConfig
+
+    >>> tokenizer = AutoTokenizer.from_pretrained('google/gemma-2-2b', padding_side="left")
+    >>> model = AutoModelForCausalLM.from_pretrained('google/gemma-2-2b')
+
+    >>> # SynthID Text configuration
+    >>> watermarking_config = SynthIDTextWatermarkingConfig(
+    ...     keys=[654, 400, 836, 123, 340, 443, 597, 160, 57],
+    ...     ngram_len=5,
+    ... )
+
+    >>> # Generation with watermarking
+    >>> tokenized_prompts = tokenizer(["Once upon a time, "], return_tensors="pt", padding=True)
+    >>> output_sequences = model.generate(
+    ...     **tokenized_prompts, watermarking_config=watermarking_config, do_sample=True, max_new_tokens=10
+    ... )
+    >>> watermarked_text = tokenizer.batch_decode(output_sequences, skip_special_tokens=True)
+    ```
+    """
+
+    def __init__(
+        self,
+        ngram_len: int,
+        keys: List[int],
+        context_history_size: int = 1024,
+        sampling_table_seed: int = 0,
+        sampling_table_size: int = 2**16,
+        skip_first_ngram_calls: bool = False,
+        debug_mode: bool = False,
+    ):
+        self.ngram_len = ngram_len
+        self.keys = keys
+        self.sampling_table_size = sampling_table_size
+        self.sampling_table_seed = sampling_table_seed
+        self.context_history_size = context_history_size
+        self.skip_first_ngram_calls = skip_first_ngram_calls
+        self.debug_mode = debug_mode
+
+    def validate(self):
+        watermark_missing_arg_msg = (
+            "Some of the keys in `watermarking_config` are defined incorrectly. `{key}` should be {correct_value}` "
+            "but found {found_value}"
+        )
+        if self.sampling_table_size > 2**24:
+            raise ValueError(
+                watermark_missing_arg_msg.format(
+                    key="sampling_table_size",
+                    correct_value="< 2**24",
+                    found_value=self.sampling_table_size,
+                ),
+            )
+
+    def construct_processor(self, vocab_size: int, device) -> "WatermarkLogitsProcessor":
+        return SynthIDTextWatermarkLogitsProcessor(
+            ngram_len=self.ngram_len,
+            keys=self.keys,
+            sampling_table_size=self.sampling_table_size,
+            sampling_table_seed=self.sampling_table_seed,
+            context_history_size=self.context_history_size,
+            device=device,
+            skip_first_ngram_calls=self.skip_first_ngram_calls,
+            debug_mode=self.debug_mode,
+        )
+
+
+@dataclass
+class CompileConfig(object):
+    """
+    Class that holds arguments relative to `torch.compile` behavior, when using automatic compilation in `generate`.
+    See [`torch.compile`](https://pytorch.org/docs/stable/generated/torch.compile.html) for more details on the arguments.
+
+    Args:
+        fullgraph (`bool`, *optional*, defaults to `True`):
+            If `True`, requires that the whole forward be capturable in a single graph.
+        dynamic (`bool` or `None`, *optional*):
+            Whether to try to use dynamic shape graphs.
+        backend (`str` or `Callable`, *optional*, defaults to `"inductor"`):
+            Backend to be used.
+        mode (`str`, *optional*, defaults to `"reduce-overhead"`):
+            Controls balance between performance and overhead.
+        options (`dict`, *optional*):
+            A dictionary of options to pass to the backend.
+
+    Examples:
+    ```python
+    >>> from transformers import AutoModelForCausalLM, AutoTokenizer, CompileConfig
+
+    >>> tokenizer = AutoTokenizer.from_pretrained('google/gemma-2-2b')
+    >>> model = AutoModelForCausalLM.from_pretrained('google/gemma-2-2b').cuda()
+
+    >>> # Automatic compile configuration, used with static cache
+    >>> compile_config = CompileConfig(dynamic=True)
+
+    >>> # Generation with static cache and compile config
+    >>> input = tokenizer.encode("Hello there, how", return_tensors="pt").cuda()
+    >>> output = model.generate(
+    ...     input, do_sample=False, max_new_tokens=300, cache_implementation="static", compile_config=compile_config
+    ... )
+    >>> output_text = tokenizer.batch_decode(output, skip_special_tokens=True)[0]
+    ```
+    """
+
+    fullgraph: bool = True
+    dynamic: Optional[bool] = None
+    backend: Union[str, Callable] = "inductor"
+    mode: str = "reduce-overhead"
+    options: Optional[dict] = None
+
+    def to_dict(self) -> Dict[str, Any]:
+        """Serializes this instance to a Python dictionary."""
+        return copy.deepcopy(self.__dict__)
diff --git a/.venv/lib/python3.11/site-packages/transformers/generation/flax_logits_process.py b/.venv/lib/python3.11/site-packages/transformers/generation/flax_logits_process.py
new file mode 100644
index 0000000000000000000000000000000000000000..d106c32defa4db3577d3a72158550d355aa5ece7
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/transformers/generation/flax_logits_process.py
@@ -0,0 +1,544 @@
+# coding=utf-8
+# Copyright 2021 The HuggingFace Inc. team
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import inspect
+
+import jax
+import jax.lax as lax
+import jax.numpy as jnp
+from jax.experimental import sparse
+
+from ..utils import add_start_docstrings
+from ..utils.logging import get_logger
+
+
+logger = get_logger(__name__)
+
+
+LOGITS_PROCESSOR_INPUTS_DOCSTRING = r"""
+    Args:
+        input_ids (`jnp.ndarray` of shape `(batch_size, sequence_length)`):
+            Indices of input sequence tokens in the vocabulary.
+
+            Indices can be obtained using [`PreTrainedTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+            [`PreTrainedTokenizer.__call__`] for details.
+
+            [What are input IDs?](../glossary#input-ids)
+        scores (`jnp.ndarray` of shape `(batch_size, config.vocab_size)`):
+            Prediction scores of a language modeling head. These can be logits for each vocabulary when not using beam
+            search or log softmax for each vocabulary token when using beam search
+        kwargs (`Dict[str, Any]`, *optional*):
+            Additional logits processor specific kwargs.
+
+    Return:
+        `jnp.ndarray` of shape `(batch_size, config.vocab_size)`: The processed prediction scores.
+
+"""
+
+
+class FlaxLogitsProcessor:
+    """Abstract base class for all logit processors that can be applied during generation."""
+
+    @add_start_docstrings(LOGITS_PROCESSOR_INPUTS_DOCSTRING)
+    def __call__(self, input_ids: jnp.ndarray, scores: jnp.ndarray) -> jnp.ndarray:
+        """Flax method for processing logits."""
+        raise NotImplementedError(
+            f"{self.__class__} is an abstract class. Only classes inheriting this class can be called."
+        )
+
+
+class FlaxLogitsWarper:
+    """Abstract base class for all logit warpers that can be applied during generation with multinomial sampling."""
+
+    @add_start_docstrings(LOGITS_PROCESSOR_INPUTS_DOCSTRING)
+    def __call__(self, input_ids: jnp.ndarray, scores: jnp.ndarray) -> jnp.ndarray:
+        """Flax method for warping logits."""
+        raise NotImplementedError(
+            f"{self.__class__} is an abstract class. Only classes inheriting this class can be called."
+        )
+
+
+class FlaxLogitsProcessorList(list):
+    """
+    This class can be used to create a list of [`FlaxLogitsProcessor`] or [`FlaxLogitsWarper`] to subsequently process
+    a `scores` input tensor. This class inherits from list and adds a specific *__call__* method to apply each
+    [`FlaxLogitsProcessor`] or [`FlaxLogitsWarper`] to the inputs.
+    """
+
+    @add_start_docstrings(LOGITS_PROCESSOR_INPUTS_DOCSTRING)
+    def __call__(self, input_ids: jnp.ndarray, scores: jnp.ndarray, cur_len: int, **kwargs) -> jnp.ndarray:
+        for processor in self:
+            function_args = inspect.signature(processor.__call__).parameters
+            if len(function_args) > 3:
+                if not all(arg in kwargs for arg in list(function_args.keys())[2:]):
+                    raise ValueError(
+                        f"Make sure that all the required parameters: {list(function_args.keys())} for "
+                        f"{processor.__class__} are passed to the logits processor."
+                    )
+                scores = processor(input_ids, scores, cur_len, **kwargs)
+            else:
+                scores = processor(input_ids, scores, cur_len)
+        return scores
+
+
+class FlaxTemperatureLogitsWarper(FlaxLogitsWarper):
+    r"""
+    [`FlaxLogitsWarper`] for temperature (exponential scaling output probability distribution).
+
+    Args:
+        temperature (`float`):
+            The value used to module the logits distribution.
+    """
+
+    def __init__(self, temperature: float):
+        if not isinstance(temperature, float) or not (temperature > 0):
+            raise ValueError(f"`temperature` has to be a strictly positive float, but is {temperature}")
+
+        self.temperature = temperature
+
+    def __call__(self, input_ids: jnp.ndarray, scores: jnp.ndarray, cur_len: int) -> jnp.ndarray:
+        scores = scores / self.temperature
+        return scores
+
+
+class FlaxTopPLogitsWarper(FlaxLogitsWarper):
+    """
+    [`FlaxLogitsWarper`] that performs top-p, i.e. restricting to top tokens summing to prob_cut_off <= prob_cut_off.
+
+    Args:
+        top_p (`float`):
+            If set to < 1, only the smallest set of most probable tokens with probabilities that add up to `top_p` or
+            higher are kept for generation.
+        filter_value (`float`, *optional*, defaults to -inf):
+            All filtered values will be set to this float value.
+        min_tokens_to_keep (`int`, *optional*, defaults to 1):
+            Minimum number of tokens that cannot be filtered.
+    """
+
+    def __init__(self, top_p: float, filter_value: float = -float("Inf"), min_tokens_to_keep: int = 1):
+        if not isinstance(top_p, float) or (top_p < 0 or top_p > 1.0):
+            raise ValueError(f"`top_p` has to be a float > 0 and < 1, but is {top_p}")
+        if not isinstance(min_tokens_to_keep, int) or (min_tokens_to_keep < 1):
+            raise ValueError(f"`min_tokens_to_keep` has to be a positive integer, but is {min_tokens_to_keep}")
+
+        self.top_p = top_p
+        self.filter_value = filter_value
+        self.min_tokens_to_keep = min_tokens_to_keep
+
+    def __call__(self, input_ids: jnp.ndarray, scores: jnp.ndarray, cur_len: int) -> jnp.ndarray:
+        topk_scores, topk_indices = lax.top_k(scores, scores.shape[-1])
+
+        mask_scores = jnp.full_like(scores, self.filter_value)
+        cumulative_probs = jax.nn.softmax(topk_scores, axis=-1).cumsum(axis=-1)
+        score_mask = cumulative_probs < self.top_p
+
+        # include the token that is higher than top_p as well
+        score_mask = jnp.roll(score_mask, 1)
+        score_mask |= score_mask.at[:, 0].set(True)
+
+        # min tokens to keep
+        score_mask = score_mask.at[:, : self.min_tokens_to_keep].set(True)
+
+        topk_next_scores = jnp.where(score_mask, topk_scores, mask_scores)
+        next_scores = jax.lax.sort_key_val(topk_indices, topk_next_scores)[-1]
+
+        return next_scores
+
+
+class FlaxTopKLogitsWarper(FlaxLogitsWarper):
+    r"""
+    [`FlaxLogitsWarper`] that performs top-k, i.e. restricting to the k highest probability elements.
+
+    Args:
+        top_k (`int`):
+            The number of highest probability vocabulary tokens to keep for top-k-filtering.
+        filter_value (`float`, *optional*, defaults to -inf):
+            All filtered values will be set to this float value.
+        min_tokens_to_keep (`int`, *optional*, defaults to 1):
+            Minimum number of tokens that cannot be filtered.
+    """
+
+    def __init__(self, top_k: int, filter_value: float = -float("Inf"), min_tokens_to_keep: int = 1):
+        if not isinstance(top_k, int) or top_k <= 0:
+            raise ValueError(f"`top_k` has to be a strictly positive integer, but is {top_k}")
+
+        self.top_k = max(top_k, min_tokens_to_keep)
+        self.filter_value = filter_value
+
+    def __call__(self, input_ids: jnp.ndarray, scores: jnp.ndarray, cur_len: int) -> jnp.ndarray:
+        batch_size, vocab_size = scores.shape
+        next_scores_flat = jnp.full(batch_size * vocab_size, self.filter_value)
+
+        topk = min(self.top_k, scores.shape[-1])  # Safety check
+        topk_scores, topk_indices = lax.top_k(scores, topk)
+        shift = jnp.broadcast_to((jnp.arange(batch_size) * vocab_size)[:, None], (batch_size, topk)).flatten()
+        topk_scores_flat = topk_scores.flatten()
+        topk_indices_flat = topk_indices.flatten() + shift
+
+        next_scores_flat = next_scores_flat.at[topk_indices_flat].set(topk_scores_flat)
+        next_scores = next_scores_flat.reshape(batch_size, vocab_size)
+        return next_scores
+
+
+class FlaxForcedBOSTokenLogitsProcessor(FlaxLogitsProcessor):
+    r"""
+    [`FlaxLogitsProcessor`] that enforces the specified token as the first generated token.
+
+    Args:
+        bos_token_id (`int`):
+            The id of the token to force as the first generated token.
+    """
+
+    def __init__(self, bos_token_id: int):
+        self.bos_token_id = bos_token_id
+
+    def __call__(self, input_ids: jnp.ndarray, scores: jnp.ndarray, cur_len: int) -> jnp.ndarray:
+        new_scores = jnp.full(scores.shape, -float("inf"))
+
+        apply_penalty = 1 - jnp.bool_(cur_len - 1)
+
+        scores = jnp.where(apply_penalty, new_scores.at[:, self.bos_token_id].set(0), scores)
+
+        return scores
+
+
+class FlaxForcedEOSTokenLogitsProcessor(FlaxLogitsProcessor):
+    r"""
+    [`FlaxLogitsProcessor`] that enforces the specified token as the last generated token when `max_length` is reached.
+
+    Args:
+        max_length (`int`):
+            The maximum length of the sequence to be generated.
+        eos_token_id (`int`):
+            The id of the token to force as the last generated token when `max_length` is reached.
+    """
+
+    def __init__(self, max_length: int, eos_token_id: int):
+        self.max_length = max_length
+        self.eos_token_id = eos_token_id
+
+    def __call__(self, input_ids: jnp.ndarray, scores: jnp.ndarray, cur_len: int) -> jnp.ndarray:
+        new_scores = jnp.full(scores.shape, -float("inf"))
+
+        apply_penalty = 1 - jnp.bool_(cur_len - self.max_length + 1)
+
+        scores = jnp.where(apply_penalty, new_scores.at[:, self.eos_token_id].set(0), scores)
+
+        return scores
+
+
+class FlaxMinLengthLogitsProcessor(FlaxLogitsProcessor):
+    r"""
+    [`FlaxLogitsProcessor`] enforcing a min-length by setting EOS probability to 0.
+
+    Args:
+        min_length (`int`):
+            The minimum length below which the score of `eos_token_id` is set to `-float("Inf")`.
+        eos_token_id (`int`):
+            The id of the *end-of-sequence* token.
+    """
+
+    def __init__(self, min_length: int, eos_token_id: int):
+        if not isinstance(min_length, int) or min_length < 0:
+            raise ValueError(f"`min_length` has to be a positive integer, but is {min_length}")
+
+        if not isinstance(eos_token_id, int) or eos_token_id < 0:
+            raise ValueError(f"`eos_token_id` has to be a positive integer, but is {eos_token_id}")
+
+        self.min_length = min_length
+        self.eos_token_id = eos_token_id
+
+    def __call__(self, input_ids: jnp.ndarray, scores: jnp.ndarray, cur_len: int) -> jnp.ndarray:
+        # create boolean flag to decide if min length penalty should be applied
+        apply_penalty = 1 - jnp.clip(cur_len - self.min_length, 0, 1)
+
+        scores = jnp.where(apply_penalty, scores.at[:, self.eos_token_id].set(-float("inf")), scores)
+
+        return scores
+
+
+class FlaxSuppressTokensAtBeginLogitsProcessor(FlaxLogitsProcessor):
+    r"""
+    [`FlaxLogitsProcessor`] supressing a list of tokens as soon as the `generate` function starts generating using
+    `begin_index` tokens. This should ensure that the tokens defined by `begin_suppress_tokens` are not sampled at the
+    beginning of the generation.
+
+    Args:
+        begin_suppress_tokens (`List[int]`):
+            Tokens to not sample.
+        begin_index (`int`):
+            Index where the tokens are suppressed.
+    """
+
+    def __init__(self, begin_suppress_tokens, begin_index):
+        self.begin_suppress_tokens = list(begin_suppress_tokens)
+        self.begin_index = begin_index
+
+    def __call__(self, input_ids, scores, cur_len: int):
+        apply_penalty = 1 - jnp.bool_(cur_len - self.begin_index)
+
+        scores = jnp.where(apply_penalty, scores.at[:, self.begin_suppress_tokens].set(-float("inf")), scores)
+
+        return scores
+
+
+class FlaxSuppressTokensLogitsProcessor(FlaxLogitsProcessor):
+    r"""
+    [`FlaxLogitsProcessor`] suppressing a list of tokens at each decoding step. The processor will set their log probs
+    to be `-inf` so they are not sampled.
+
+    Args:
+        suppress_tokens (`list`):
+            Tokens to not sample.
+    """
+
+    def __init__(self, suppress_tokens: list):
+        self.suppress_tokens = list(suppress_tokens)
+
+    def __call__(self, input_ids: jnp.ndarray, scores: jnp.ndarray, cur_len: int) -> jnp.ndarray:
+        scores = scores.at[..., self.suppress_tokens].set(-float("inf"))
+
+        return scores
+
+
+class FlaxForceTokensLogitsProcessor(FlaxLogitsProcessor):
+    r"""
+    [`FlaxLogitsProcessor`] that takes a list of pairs of integers which indicates a mapping from generation indices to
+    token indices that will be forced before sampling. The processor will set their log probs to 0 and all other tokens
+    to `-inf` so that they are sampled at their corresponding index.
+
+    Args:
+        force_token_map (`list`):
+            Map giving token ids and indices where they will be forced to be sampled.
+    """
+
+    def __init__(self, force_token_map):
+        force_token_map = dict(force_token_map)
+        # Converts the dictionary of format {index: token} containing the tokens to be forced to an array, where the
+        # index of the array corresponds to the index of the token to be forced, for XLA compatibility.
+        # Indexes without forced tokens will have a negative value.
+        force_token_array = jnp.ones((max(force_token_map.keys()) + 1), dtype=jnp.int32) * -1
+        for index, token in force_token_map.items():
+            if token is not None:
+                force_token_array = force_token_array.at[index].set(token)
+        self.force_token_array = jnp.int32(force_token_array)
+
+    def __call__(self, input_ids: jnp.ndarray, scores: jnp.ndarray, cur_len: int) -> jnp.ndarray:
+        def _force_token(generation_idx):
+            batch_size = scores.shape[0]
+            current_token = self.force_token_array[generation_idx]
+
+            new_scores = jnp.ones_like(scores, dtype=scores.dtype) * -float("inf")
+            updates = jnp.zeros((batch_size, 1), dtype=scores.dtype)
+            new_scores = lax.dynamic_update_slice(new_scores, updates, (0, current_token))
+            return new_scores
+
+        scores = lax.cond(
+            cur_len >= self.force_token_array.shape[0],
+            # If the current length is geq than the length of force_token_array, the processor does nothing.
+            lambda: scores,
+            # Otherwise, it may force a certain token.
+            lambda: lax.cond(
+                self.force_token_array[cur_len] >= 0,
+                # Only valid (positive) tokens are forced
+                lambda: _force_token(cur_len),
+                # Otherwise, the processor does nothing.
+                lambda: scores,
+            ),
+        )
+        return scores
+
+
+class FlaxWhisperTimeStampLogitsProcessor(FlaxLogitsProcessor):
+    r"""
+    Whisper specific Processor. This processor can be used to force a list of tokens. The processor will set their log
+    probs to `inf` so that they are sampled at their corresponding index.
+
+    Args:
+        generate_config (`GenerateConfig`):
+            The generate config used to generate the output. The following parameters are required:
+                eos_token_id (`int`, *optional*, defaults to 50257):
+                    The id of the *end-of-sequence* token.
+                no_timestamps_token_id (`int`, *optional*, defaults to 50363):
+                    The id of the `"<|notimestamps|>"` token.
+                max_initial_timestamp_index (`int`, *optional*, defaults to 1):
+                    Used to set the maximum value of the initial timestamp. This is used to prevent the model from
+                    predicting timestamps that are too far in the future.
+    """
+
+    def __init__(self, generate_config, model_config, decoder_input_length):
+        self.eos_token_id = generate_config.eos_token_id
+        self.no_timestamps_token_id = generate_config.no_timestamps_token_id
+        self.timestamp_begin = generate_config.no_timestamps_token_id + 1
+
+        self.begin_index = decoder_input_length + 1
+
+        if generate_config.is_multilingual:
+            # room for language token and task token
+            self.begin_index += 2
+        if hasattr(generate_config, "max_initial_timestamp_index"):
+            self.max_initial_timestamp_index = generate_config.max_initial_timestamp_index
+        else:
+            self.max_initial_timestamp_index = model_config.vocab_size
+        if self.max_initial_timestamp_index is None:
+            self.max_initial_timestamp_index = model_config.vocab_size
+
+    def __call__(self, input_ids, scores, cur_len):
+        # suppress <|notimestamps|> which is handled by without_timestamps
+        scores = scores.at[:, self.no_timestamps_token_id].set(-float("inf"))
+
+        def handle_pairs(input_ids_k, scores_k):
+            last_was_timestamp = jnp.where((cur_len - self.begin_index) >= 1, True, False)
+            last_was_timestamp = jnp.where(
+                input_ids_k[cur_len - 1] >= self.timestamp_begin,
+                True and last_was_timestamp,
+                False,
+            )
+
+            penultimate_was_timestamp = jnp.where((cur_len - self.begin_index) < 2, True, False)
+            penultimate_was_timestamp = jnp.where(
+                input_ids_k[cur_len - 2] >= self.timestamp_begin,
+                True,
+                penultimate_was_timestamp,
+            )
+
+            return jnp.where(
+                last_was_timestamp,
+                jnp.where(
+                    penultimate_was_timestamp > 0,
+                    scores_k.at[self.timestamp_begin :].set(-float("inf")),
+                    scores_k.at[: self.eos_token_id].set(-float("inf")),
+                ),
+                scores_k,
+            )
+
+        scores = jax.vmap(handle_pairs)(input_ids, scores)
+
+        apply_max_initial_timestamp = jnp.where(cur_len == self.begin_index, True, False)
+        apply_max_initial_timestamp = jnp.where(
+            self.max_initial_timestamp_index is not None,
+            True and apply_max_initial_timestamp,
+            False,
+        )
+
+        last_allowed = self.timestamp_begin + self.max_initial_timestamp_index
+
+        scores = jnp.where(
+            apply_max_initial_timestamp,
+            scores.at[:, last_allowed + 1 :].set(-float("inf")),
+            scores,
+        )
+
+        # if sum of probability over timestamps is above any other token, sample timestamp
+        logprobs = jax.nn.log_softmax(scores, axis=-1)
+
+        def handle_cumulative_probs(logprobs_k, scores_k):
+            timestamp_logprob = jax.nn.logsumexp(logprobs_k[self.timestamp_begin :], axis=-1)
+            max_text_token_logprob = jnp.max(logprobs_k[: self.timestamp_begin])
+            return jnp.where(
+                timestamp_logprob > max_text_token_logprob,
+                scores_k.at[: self.timestamp_begin].set(-float("inf")),
+                scores_k,
+            )
+
+        scores = jax.vmap(handle_cumulative_probs)(logprobs, scores)
+
+        return scores
+
+
+class FlaxNoRepeatNGramLogitsProcessor(FlaxLogitsProcessor):
+    r"""
+    [`FlaxLogitsProcessor`] that enforces no repetition of n-grams. See
+    [Fairseq](https://github.com/pytorch/fairseq/blob/a07cb6f40480928c9e0548b737aadd36ee66ac76/fairseq/sequence_generator.py#L345).
+
+    Args:
+        ngram_size (`int`):
+            All ngrams of size `ngram_size` can only occur once.
+    """
+
+    def __init__(self, ngram_size: int):
+        if not isinstance(ngram_size, int) or ngram_size <= 0:
+            raise ValueError(f"`ngram_size` has to be a strictly positive integer, but is {ngram_size}")
+        self.ngram_size = ngram_size
+
+    def get_previous_ngrams(self, input_ids: jnp.ndarray, vocab_size: int, cur_len: int):
+        """
+        get a matrix of size (batch_size,) + (vocab_size,)*n (for n-grams) that
+        represent the n-grams that occurred previously.
+        The BCOO representation allow to store only the few non-zero entries, instead of the full (huge) matrix
+        """
+        batch_size, seq_len = input_ids.shape
+        # number of n-grams in the whole sequence
+        seq_ngrams = seq_len - (self.ngram_size - 1)
+        # number of n-grams in the currently generated sequence
+        cur_ngrams = cur_len - (self.ngram_size - 1)
+
+        def body_fun(i, val):
+            b = i % batch_size
+            pos = i // batch_size
+            return val.at[i].set(
+                jnp.array(
+                    [
+                        b,
+                    ]
+                    + [jnp.array(input_ids)[b, pos + j] for j in range(self.ngram_size)]
+                )
+            )
+
+        shape = (batch_size * seq_ngrams, self.ngram_size + 1)
+        all_update_indices = jax.lax.fori_loop(
+            0, batch_size * cur_ngrams, body_fun, jnp.zeros(shape, dtype=input_ids.dtype)
+        )
+
+        # ignore the n-grams not yet generated
+        data = (jnp.arange(batch_size * seq_ngrams) < batch_size * cur_ngrams).astype("float32")
+
+        return sparse.BCOO((data, all_update_indices), shape=(batch_size,) + (vocab_size,) * self.ngram_size)
+
+    def get_banned_tokens_mask(self, latest_tokens: jnp.ndarray, previous_ngrams) -> jnp.ndarray:
+        """
+        Determines which tokens must be banned given latest tokens and the previously seen
+        ngrams.
+        """
+
+        @sparse.sparsify
+        @jax.vmap
+        def inner_fn(latest_tokens, previous_ngrams):
+            return previous_ngrams[tuple(latest_tokens)]
+
+        return sparse.bcoo_todense(inner_fn(latest_tokens, previous_ngrams))
+
+    def __call__(self, input_ids: jnp.ndarray, scores: jnp.ndarray, cur_len: int) -> jnp.ndarray:
+        def true_fn():
+            _, vocab_size = scores.shape
+            # store the previously seen n-grams
+            previous_ngrams = self.get_previous_ngrams(input_ids, vocab_size, cur_len)
+
+            # get the n-1 last tokens that prefix the n-gram being generated
+            latest_tokens = jnp.zeros((input_ids.shape[0], self.ngram_size - 1), dtype=input_ids.dtype)
+            latest_tokens = jax.lax.dynamic_update_slice(
+                latest_tokens,
+                jax.lax.dynamic_slice(
+                    input_ids, (0, cur_len - (self.ngram_size - 1)), (input_ids.shape[0], (self.ngram_size - 1))
+                ),
+                (0, 0),
+            )
+
+            # compute the banned tokens, ie all the tokens that when added to the latest tokens lead to a n-gram that was previously generated
+            banned_tokens_indices_mask = self.get_banned_tokens_mask(latest_tokens, previous_ngrams).astype("bool")
+            return jnp.where(banned_tokens_indices_mask, -float("inf"), scores)
+
+        output = jax.lax.cond((cur_len >= self.ngram_size - 1), true_fn, lambda: scores)
+        return output
diff --git a/.venv/lib/python3.11/site-packages/transformers/generation/flax_utils.py b/.venv/lib/python3.11/site-packages/transformers/generation/flax_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..134666d45a3a9e572df0503281af7272a3afaf9d
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/transformers/generation/flax_utils.py
@@ -0,0 +1,1027 @@
+# coding=utf-8
+# Copyright 2021 The Google AI Flax Team Authors, and The HuggingFace Inc. team.
+# Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+import copy
+import inspect
+import warnings
+from functools import partial
+from typing import Any, Dict, Optional, Union
+
+import flax
+import jax
+import jax.numpy as jnp
+import numpy as np
+from jax import lax
+
+from ..models.auto import (
+    FLAX_MODEL_FOR_CAUSAL_LM_MAPPING,
+    FLAX_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING,
+    FLAX_MODEL_FOR_VISION_2_SEQ_MAPPING,
+)
+from ..utils import ModelOutput, logging
+from .configuration_utils import GenerationConfig
+from .flax_logits_process import (
+    FlaxForcedBOSTokenLogitsProcessor,
+    FlaxForcedEOSTokenLogitsProcessor,
+    FlaxForceTokensLogitsProcessor,
+    FlaxLogitsProcessorList,
+    FlaxMinLengthLogitsProcessor,
+    FlaxNoRepeatNGramLogitsProcessor,
+    FlaxSuppressTokensAtBeginLogitsProcessor,
+    FlaxSuppressTokensLogitsProcessor,
+    FlaxTemperatureLogitsWarper,
+    FlaxTopKLogitsWarper,
+    FlaxTopPLogitsWarper,
+)
+
+
+logger = logging.get_logger(__name__)
+
+
+@flax.struct.dataclass
+class FlaxGreedySearchOutput(ModelOutput):
+    """
+    Flax Base class for outputs of decoder-only generation models using greedy search.
+
+
+    Args:
+        sequences (`jnp.ndarray` of shape `(batch_size, max_length)`):
+            The generated sequences.
+    """
+
+    sequences: jnp.ndarray = None
+
+
+@flax.struct.dataclass
+class FlaxSampleOutput(ModelOutput):
+    """
+    Flax Base class for outputs of decoder-only generation models using sampling.
+
+
+    Args:
+        sequences (`jnp.ndarray` of shape `(batch_size, max_length)`):
+            The generated sequences.
+    """
+
+    sequences: jnp.ndarray = None
+
+
+@flax.struct.dataclass
+class FlaxBeamSearchOutput(ModelOutput):
+    """
+    Flax Base class for outputs of decoder-only generation models using greedy search.
+
+
+    Args:
+        sequences (`jnp.ndarray` of shape `(batch_size, max_length)`):
+            The generated sequences.
+        scores (`jnp.ndarray` of shape `(batch_size,)`):
+            The scores (log probabilities) of the generated sequences.
+    """
+
+    sequences: jnp.ndarray = None
+    scores: jnp.ndarray = None
+
+
+@flax.struct.dataclass
+class GreedyState:
+    cur_len: jnp.ndarray
+    sequences: jnp.ndarray
+    running_token: jnp.ndarray
+    is_sent_finished: jnp.ndarray
+    model_kwargs: Dict[str, jnp.ndarray]
+
+
+@flax.struct.dataclass
+class SampleState:
+    cur_len: jnp.ndarray
+    sequences: jnp.ndarray
+    running_token: jnp.ndarray
+    is_sent_finished: jnp.ndarray
+    prng_key: jnp.ndarray
+    model_kwargs: Dict[str, jnp.ndarray]
+
+
+@flax.struct.dataclass
+class BeamSearchState:
+    cur_len: jnp.ndarray
+    running_sequences: jnp.ndarray
+    running_scores: jnp.ndarray
+    sequences: jnp.ndarray
+    scores: jnp.ndarray
+    is_sent_finished: jnp.ndarray
+    model_kwargs: Dict[str, jnp.ndarray]
+
+
+class FlaxGenerationMixin:
+    """
+    A class containing all functions for auto-regressive text generation, to be used as a mixin in
+    [`FlaxPreTrainedModel`].
+
+    The class exposes [`~generation.FlaxGenerationMixin.generate`], which can be used for:
+            - *greedy decoding* by calling [`~generation.FlaxGenerationMixin._greedy_search`] if `num_beams=1` and
+              `do_sample=False`
+            - *multinomial sampling* by calling [`~generation.FlaxGenerationMixin._sample`] if `num_beams=1` and
+              `do_sample=True`
+            - *beam-search decoding* by calling [`~generation.FlaxGenerationMixin._beam_search`] if `num_beams>1` and
+              `do_sample=False`
+
+    You do not need to call any of the above methods directly. Pass custom parameter values to 'generate' instead. To
+    learn more about decoding strategies refer to the [text generation strategies guide](../generation_strategies).
+    """
+
+    def prepare_inputs_for_generation(self, *args, **kwargs):
+        raise NotImplementedError(
+            "A model class needs to define a `prepare_inputs_for_generation` method in order to use `generate`."
+        )
+
+    @staticmethod
+    def _run_loop_in_debug(cond_fn, body_fn, init_state):
+        """
+        Run generation in untraced mode. This should only be used for debugging purposes.
+        """
+        state = init_state
+        while cond_fn(state):
+            state = body_fn(state)
+        return state
+
+    def _prepare_encoder_decoder_kwargs_for_generation(self, input_ids, params, model_kwargs):
+        encoder_kwargs = {
+            argument: value
+            for argument, value in model_kwargs.items()
+            if not (argument.startswith("decoder_") or argument.startswith("cross_attn"))
+        }
+        model_kwargs["encoder_outputs"] = self.encode(input_ids, params=params, return_dict=True, **encoder_kwargs)
+        return model_kwargs
+
+    def _prepare_decoder_input_ids_for_generation(
+        self,
+        batch_size: int,
+        decoder_start_token_id: int = None,
+        bos_token_id: int = None,
+        model_kwargs: Optional[Dict[str, jnp.ndarray]] = None,
+    ) -> jnp.ndarray:
+        if model_kwargs is not None and "decoder_input_ids" in model_kwargs:
+            # Only use this arg if not None, otherwise just remove from model_kwargs
+            decoder_input_ids = model_kwargs.pop("decoder_input_ids")
+            if decoder_input_ids is not None:
+                return decoder_input_ids
+        decoder_start_token_id = self._get_decoder_start_token_id(decoder_start_token_id, bos_token_id)
+        return jnp.array(decoder_start_token_id, dtype="i4").reshape(1, -1).repeat(batch_size, axis=0)
+
+    def _get_decoder_start_token_id(self, decoder_start_token_id: int = None, bos_token_id: int = None) -> int:
+        # retrieve decoder_start_token_id for encoder-decoder models
+        # fall back to bos_token_id if necessary
+        decoder_start_token_id = (
+            decoder_start_token_id
+            if decoder_start_token_id is not None
+            else self.generation_config.decoder_start_token_id
+        )
+        bos_token_id = bos_token_id if bos_token_id is not None else self.generation_config.bos_token_id
+        if decoder_start_token_id is not None:
+            return decoder_start_token_id
+        elif (
+            hasattr(self.config, "decoder")
+            and hasattr(self.config.decoder, "decoder_start_token_id")
+            and self.config.decoder.decoder_start_token_id is not None
+        ):
+            return self.config.decoder.decoder_start_token_id
+        elif bos_token_id is not None:
+            return bos_token_id
+        elif (
+            hasattr(self.config, "decoder")
+            and hasattr(self.config.decoder, "bos_token_id")
+            and self.config.decoder.bos_token_id is not None
+        ):
+            return self.config.decoder.bos_token_id
+        raise ValueError(
+            "`decoder_start_token_id` or `bos_token_id` has to be defined for encoder-decoder generation."
+        )
+
+    @staticmethod
+    def _expand_to_num_beams(tensor, num_beams):
+        return jnp.broadcast_to(tensor[:, None], (tensor.shape[0], num_beams) + tensor.shape[1:])
+
+    def _adapt_logits_for_beam_search(self, logits):
+        """
+        This function can be overwritten in the specific modeling_flax_<model-name>.py classes to allow for custom beam
+        search behavior. Note that the only model that overwrites this method is [`~transformes.FlaxMarianMTModel`].
+        """
+        return logits
+
+    def _validate_model_class(self):
+        """
+        Confirms that the model class is compatible with generation. If not, raises an exception that points to the
+        right class to use.
+        """
+        if not self.can_generate():
+            generate_compatible_mappings = [
+                FLAX_MODEL_FOR_CAUSAL_LM_MAPPING,
+                FLAX_MODEL_FOR_VISION_2_SEQ_MAPPING,
+                FLAX_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING,
+            ]
+            generate_compatible_classes = set()
+            for model_mapping in generate_compatible_mappings:
+                supported_models = model_mapping.get(type(self.config), default=None)
+                if supported_models is not None:
+                    generate_compatible_classes.add(supported_models.__name__)
+            exception_message = (
+                f"The current model class ({self.__class__.__name__}) is not compatible with `.generate()`, as "
+                "it doesn't have a language model head."
+            )
+            if generate_compatible_classes:
+                exception_message += f" Please use one of the following classes instead: {generate_compatible_classes}"
+            raise TypeError(exception_message)
+
+    def _validate_model_kwargs(self, model_kwargs: Dict[str, Any]):
+        """Validates model kwargs for generation. Generate argument typos will also be caught here."""
+        unused_model_args = []
+        model_args = set(inspect.signature(self.prepare_inputs_for_generation).parameters)
+        # `kwargs`/`model_kwargs` is often used to handle optional forward pass inputs like `attention_mask`. If
+        # `prepare_inputs_for_generation` doesn't accept them, then a stricter check can be made ;)
+        if "kwargs" in model_args or "model_kwargs" in model_args:
+            model_args |= set(inspect.signature(self.__call__).parameters)
+        for key, value in model_kwargs.items():
+            if value is not None and key not in model_args:
+                unused_model_args.append(key)
+
+        if unused_model_args:
+            raise ValueError(
+                f"The following `model_kwargs` are not used by the model: {unused_model_args} (note: typos in the"
+                " generate arguments will also show up in this list)"
+            )
+
+    def generate(
+        self,
+        input_ids: jnp.ndarray,
+        generation_config: Optional[GenerationConfig] = None,
+        prng_key: Optional[jnp.ndarray] = None,
+        trace: bool = True,
+        params: Optional[Dict[str, jnp.ndarray]] = None,
+        logits_processor: Optional[FlaxLogitsProcessorList] = None,
+        **kwargs,
+    ):
+        r"""
+        Generates sequences of token ids for models with a language modeling head.
+
+        Parameters:
+            input_ids (`jnp.ndarray` of shape `(batch_size, sequence_length)`):
+                The sequence used as a prompt for the generation.
+            generation_config (`~generation.GenerationConfig`, *optional*):
+                The generation configuration to be used as base parametrization for the generation call. `**kwargs`
+                passed to generate matching the attributes of `generation_config` will override them. If
+                `generation_config` is not provided, the default will be used, which had the following loading
+                priority: 1) from the `generation_config.json` model file, if it exists; 2) from the model
+                configuration. Please note that unspecified parameters will inherit [`~generation.GenerationConfig`]'s
+                default values, whose documentation should be checked to parameterize generation.
+            trace (`bool`, *optional*, defaults to `True`):
+                Whether to trace generation. Setting `trace=False` should only be used for debugging and will lead to a
+                considerably slower runtime.
+            params (`Dict[str, jnp.ndarray]`, *optional*):
+                Optionally the model parameters can be passed. Can be useful for parallelized generation.
+            logits_processor (`FlaxLogitsProcessorList `, *optional*):
+                Custom logits processors that complement the default logits processors built from arguments and
+                generation config. If a logit processor is passed that is already created with the arguments or a
+                generation config an error is thrown. This feature is intended for advanced users.
+            kwargs (`Dict[str, Any]`, *optional*):
+                Ad hoc parametrization of `generate_config` and/or additional model-specific kwargs that will be
+                forwarded to the `forward` function of the model. If the model is an encoder-decoder model, encoder
+                specific kwargs should not be prefixed and decoder specific kwargs should be prefixed with *decoder_*.
+
+        Return:
+            [`~utils.ModelOutput`].
+
+        """
+        # Handle `generation_config` and kwargs that might update it, and validate the `.generate()` call
+        self._validate_model_class()
+
+        # priority: `generation_config` argument > `model.generation_config` (the default generation config)
+        if generation_config is None:
+            # legacy: users may modify the model configuration to control generation. To trigger this legacy behavior,
+            # two conditions must be met
+            # 1) the generation config must have been created from the model config (`_from_model_config` field);
+            # 2) the generation config must have seen no modification since its creation (the hash is the same).
+            if self.generation_config._from_model_config and self.generation_config._original_object_hash == hash(
+                self.generation_config
+            ):
+                new_generation_config = GenerationConfig.from_model_config(self.config)
+                if new_generation_config != self.generation_config:
+                    warnings.warn(
+                        "You have modified the pretrained model configuration to control generation. This is a"
+                        " deprecated strategy to control generation and will be removed soon, in a future version."
+                        " Please use and modify the model generation configuration (see"
+                        " https://huggingface.co/docs/transformers/generation_strategies#default-text-generation-configuration )"
+                    )
+                    self.generation_config = new_generation_config
+            generation_config = self.generation_config
+
+        generation_config = copy.deepcopy(generation_config)
+        model_kwargs = generation_config.update(**kwargs)  # All unused kwargs must be model kwargs
+        self._validate_model_kwargs(model_kwargs.copy())
+
+        logits_processor = logits_processor if logits_processor is not None else FlaxLogitsProcessorList()
+
+        # set init values
+        prng_key = prng_key if prng_key is not None else jax.random.PRNGKey(0)
+
+        if generation_config.pad_token_id is None and generation_config.eos_token_id is not None:
+            if model_kwargs.get("attention_mask") is None:
+                logger.warning(
+                    "The attention mask and the pad token id were not set. As a consequence, you may observe "
+                    "unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results."
+                )
+            eos_token_id = generation_config.eos_token_id
+            if isinstance(eos_token_id, list):
+                eos_token_id = eos_token_id[0]
+            generation_config.pad_token_id = eos_token_id
+
+        if generation_config.decoder_start_token_id is None and self.config.is_encoder_decoder:
+            raise ValueError("`decoder_start_token_id` has to be defined for encoder-decoder generation.")
+
+        # decoder-only models should use left-padding for generation (can't be checked with `trace=True`)
+        if not self.config.is_encoder_decoder and not trace:
+            if (
+                generation_config.pad_token_id is not None
+                and jnp.sum(input_ids[:, -1] == generation_config.pad_token_id) > 0
+            ):
+                logger.warning(
+                    "A decoder-only architecture is being used, but right-padding was detected! For correct "
+                    "generation results, please set `padding_side='left'` when initializing the tokenizer."
+                )
+
+        batch_size = input_ids.shape[0]
+
+        if self.config.is_encoder_decoder:
+            # add encoder_outputs to model_kwargs
+            if model_kwargs.get("encoder_outputs") is None:
+                model_kwargs = self._prepare_encoder_decoder_kwargs_for_generation(input_ids, params, model_kwargs)
+            # prepare decoder_input_ids for generation
+            input_ids = self._prepare_decoder_input_ids_for_generation(
+                batch_size,
+                decoder_start_token_id=generation_config.decoder_start_token_id,
+                bos_token_id=generation_config.bos_token_id,
+                model_kwargs=model_kwargs,
+            )
+
+        # Prepare `max_length` depending on other stopping criteria.
+        input_ids_seq_length = input_ids.shape[-1]
+        has_default_max_length = kwargs.get("max_length") is None and generation_config.max_length is not None
+        if has_default_max_length and generation_config.max_new_tokens is None and generation_config.max_length == 20:
+            # 20 is the default max_length of the generation config
+            warnings.warn(
+                f"Using the model-agnostic default `max_length` (={generation_config.max_length}) "
+                "to control the generation length.  recommend setting `max_new_tokens` to control the maximum length of the generation.",
+                UserWarning,
+            )
+        elif generation_config.max_new_tokens is not None:
+            if not has_default_max_length and generation_config.max_length is not None:
+                logger.warning(
+                    f"Both `max_new_tokens` (={generation_config.max_new_tokens}) and `max_length`(="
+                    f"{generation_config.max_length}) seem to have been set. `max_new_tokens` will take precedence. "
+                    "Please refer to the documentation for more information. "
+                    "(https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)"
+                )
+            generation_config.max_length = generation_config.max_new_tokens + input_ids_seq_length
+        else:  # by default let's always generate 10 new tokens
+            if generation_config.max_length == GenerationConfig().max_length:
+                generation_config.max_length = generation_config.max_length + input_ids_seq_length
+                max_position_embeddings = getattr(self.config, "max_position_embeddings", None)
+                if max_position_embeddings is not None:
+                    generation_config.max_length = min(generation_config.max_length, max_position_embeddings)
+
+        if generation_config.min_length is not None and generation_config.min_length > generation_config.max_length:
+            raise ValueError(
+                f"Unfeasable length constraints: the minimum length ({generation_config.min_length}) is larger than"
+                f" the maximum length ({generation_config.max_length})"
+            )
+        if input_ids_seq_length >= generation_config.max_length:
+            input_ids_string = "decoder_input_ids" if self.config.is_encoder_decoder else "input_ids"
+            logger.warning(
+                f"Input length of {input_ids_string} is {input_ids_seq_length}, but `max_length` is set to"
+                f" {generation_config.max_length}. This can lead to unexpected behavior. You should consider"
+                " increasing`max_new_tokens`."
+            )
+
+        logits_processor = self._get_logits_processor(
+            generation_config=generation_config,
+            input_ids_seq_length=input_ids_seq_length,
+            logits_processor=logits_processor,
+        )
+
+        if not generation_config.do_sample and generation_config.num_beams == 1:
+            return self._greedy_search(
+                input_ids,
+                generation_config.max_length,
+                generation_config.pad_token_id,
+                generation_config.eos_token_id,
+                logits_processor=logits_processor,
+                trace=trace,
+                params=params,
+                model_kwargs=model_kwargs,
+            )
+        elif generation_config.do_sample and generation_config.num_beams == 1:
+            logits_warper = self._get_logits_warper(generation_config=generation_config)
+            return self._sample(
+                input_ids,
+                generation_config.max_length,
+                generation_config.pad_token_id,
+                generation_config.eos_token_id,
+                prng_key,
+                logits_warper=logits_warper,
+                logits_processor=logits_processor,
+                trace=trace,
+                params=params,
+                model_kwargs=model_kwargs,
+            )
+        elif not generation_config.do_sample and generation_config.num_beams > 1:
+            # broadcast input_ids & encoder_outputs
+            input_ids = self._expand_to_num_beams(input_ids, num_beams=generation_config.num_beams)
+
+            if "encoder_outputs" in model_kwargs:
+                model_kwargs["encoder_outputs"]["last_hidden_state"] = self._expand_to_num_beams(
+                    model_kwargs["encoder_outputs"]["last_hidden_state"], num_beams=generation_config.num_beams
+                )
+
+            for kwarg in ["attention_mask", "decoder_attention_mask"]:
+                if kwarg in model_kwargs:
+                    model_kwargs[kwarg] = self._expand_to_num_beams(
+                        model_kwargs[kwarg], num_beams=generation_config.num_beams
+                    )
+
+            return self._beam_search(
+                input_ids,
+                generation_config.max_length,
+                generation_config.pad_token_id,
+                generation_config.eos_token_id,
+                length_penalty=generation_config.length_penalty,
+                early_stopping=generation_config.early_stopping,
+                logits_processor=logits_processor,
+                trace=trace,
+                params=params,
+                num_return_sequences=generation_config.num_return_sequences,
+                model_kwargs=model_kwargs,
+            )
+        else:
+            raise NotImplementedError("`Beam sampling is currently not implemented.")
+
+    def _get_logits_warper(self, generation_config: GenerationConfig) -> FlaxLogitsProcessorList:
+        """
+        This class returns a [`FlaxLogitsProcessorList`] list object that contains all relevant [`FlaxLogitsWarper`]
+        instances used for multinomial sampling.
+        """
+        warpers = FlaxLogitsProcessorList()
+
+        if generation_config.temperature is not None and generation_config.temperature != 1.0:
+            warpers.append(FlaxTemperatureLogitsWarper(generation_config.temperature))
+        if generation_config.top_k is not None and generation_config.top_k != 0:
+            warpers.append(FlaxTopKLogitsWarper(top_k=generation_config.top_k, min_tokens_to_keep=1))
+        if generation_config.top_p is not None and generation_config.top_p < 1.0:
+            warpers.append(FlaxTopPLogitsWarper(top_p=generation_config.top_p, min_tokens_to_keep=1))
+
+        return warpers
+
+    def _get_logits_processor(
+        self,
+        generation_config: GenerationConfig,
+        input_ids_seq_length: int,
+        logits_processor: Optional[FlaxLogitsProcessorList],
+    ) -> FlaxLogitsProcessorList:
+        """
+        This class returns a [`FlaxLogitsProcessorList`] list object that contains all relevant [`FlaxLogitsProcessor`]
+        instances used to modify the scores of the language model head.
+        """
+        processors = FlaxLogitsProcessorList()
+
+        if (
+            generation_config.min_length is not None
+            and generation_config.eos_token_id is not None
+            and generation_config.min_length > -1
+        ):
+            processors.append(
+                FlaxMinLengthLogitsProcessor(generation_config.min_length, generation_config.eos_token_id)
+            )
+        if generation_config.forced_bos_token_id is not None:
+            processors.append(FlaxForcedBOSTokenLogitsProcessor(generation_config.forced_bos_token_id))
+        if generation_config.forced_eos_token_id is not None:
+            processors.append(
+                FlaxForcedEOSTokenLogitsProcessor(generation_config.max_length, generation_config.forced_eos_token_id)
+            )
+        if generation_config.suppress_tokens is not None:
+            processors.append(FlaxSuppressTokensLogitsProcessor(generation_config.suppress_tokens))
+        if generation_config.begin_suppress_tokens is not None:
+            begin_index = input_ids_seq_length
+            begin_index = (
+                begin_index
+                if (input_ids_seq_length > 1 or generation_config.forced_bos_token_id is None)
+                else begin_index + 1
+            )
+            if generation_config.forced_decoder_ids is not None and len(generation_config.forced_decoder_ids) > 0:
+                # generation starts after the last token that is forced
+                begin_index += generation_config.forced_decoder_ids[-1][0]
+            processors.append(
+                FlaxSuppressTokensAtBeginLogitsProcessor(generation_config.begin_suppress_tokens, begin_index)
+            )
+        if generation_config.forced_decoder_ids is not None:
+            forced_decoder_ids = [
+                [input_ids_seq_length + i[0] - 1, i[1]] for i in generation_config.forced_decoder_ids
+            ]
+            processors.append(FlaxForceTokensLogitsProcessor(forced_decoder_ids))
+        if generation_config.no_repeat_ngram_size is not None and generation_config.no_repeat_ngram_size > 0:
+            processors.append(FlaxNoRepeatNGramLogitsProcessor(generation_config.no_repeat_ngram_size))
+        processors = self._merge_criteria_processor_list(processors, logits_processor)
+
+        return processors
+
+    def _merge_criteria_processor_list(
+        self,
+        default_list: FlaxLogitsProcessorList,
+        custom_list: FlaxLogitsProcessorList,
+    ) -> FlaxLogitsProcessorList:
+        if len(custom_list) == 0:
+            return default_list
+        for default in default_list:
+            for custom in custom_list:
+                if type(custom) is type(default):
+                    object_type = "logits processor"
+                    raise ValueError(
+                        f"A custom {object_type} of type {type(custom)} with values {custom} has been passed to"
+                        f" `generate`, but it has already been created with the values {default}. {default} has been"
+                        " created by passing the corresponding arguments to generate or by the model's config default"
+                        f" values. If you just want to change the default values of {object_type} consider passing"
+                        f" them as arguments to `generate` instead of using a custom {object_type}."
+                    )
+        default_list.extend(custom_list)
+        return default_list
+
+    def _greedy_search(
+        self,
+        input_ids: None,
+        max_length: Optional[int] = None,
+        pad_token_id: Optional[int] = None,
+        eos_token_id: Optional[int] = None,
+        logits_processor: Optional[FlaxLogitsProcessorList] = None,
+        trace: bool = True,
+        params: Optional[Dict[str, jnp.ndarray]] = None,
+        model_kwargs: Optional[Dict[str, jnp.ndarray]] = None,
+    ):
+        # init values
+        max_length = max_length if max_length is not None else self.generation_config.max_length
+        pad_token_id = pad_token_id if pad_token_id is not None else self.generation_config.pad_token_id
+        eos_token_id = eos_token_id if eos_token_id is not None else self.generation_config.eos_token_id
+
+        batch_size, cur_len = input_ids.shape
+
+        eos_token_id = jnp.array(eos_token_id, dtype=jnp.int32 if eos_token_id is not None else None)
+        pad_token_id = jnp.array(pad_token_id, dtype=jnp.int32)
+        cur_len = jnp.array(cur_len)
+
+        # per batch-item holding current token in loop.
+        sequences = jnp.full((batch_size, max_length), pad_token_id, dtype=jnp.int32)
+        sequences = lax.dynamic_update_slice(sequences, input_ids, (0, 0))
+
+        # per batch-item state bit indicating if sentence has finished.
+        is_sent_finished = jnp.zeros((batch_size,), dtype=jnp.bool_)
+
+        # For Seq2Seq generation, we only need to use the decoder instead of the whole model in generation loop
+        # and pass it the `encoder_outputs`, which are part of the `model_kwargs`.
+        model = self.decode if self.config.is_encoder_decoder else self
+        # initialize model specific kwargs
+        model_kwargs = self.prepare_inputs_for_generation(input_ids, max_length, **model_kwargs)
+
+        # initialize state
+        state = GreedyState(
+            cur_len=cur_len,
+            sequences=sequences,
+            running_token=input_ids,
+            is_sent_finished=is_sent_finished,
+            model_kwargs=model_kwargs,
+        )
+
+        def greedy_search_cond_fn(state):
+            """state termination condition fn."""
+            has_reached_max_length = state.cur_len == max_length
+            all_sequence_finished = jnp.all(state.is_sent_finished)
+            finish_generation = jnp.logical_or(has_reached_max_length, all_sequence_finished)
+            return ~finish_generation
+
+        def greedy_search_body_fn(state):
+            """state update fn."""
+            model_outputs = model(state.running_token, params=params, **state.model_kwargs)
+            logits = model_outputs.logits[:, -1]
+
+            # apply min_length, ...
+            logits = logits_processor(state.sequences, logits, state.cur_len)
+
+            next_token = jnp.argmax(logits, axis=-1)
+
+            next_token = next_token * ~state.is_sent_finished + pad_token_id * state.is_sent_finished
+            next_is_sent_finished = state.is_sent_finished | (next_token == eos_token_id)
+            next_token = next_token[:, None]
+
+            next_sequences = lax.dynamic_update_slice(state.sequences, next_token, (0, state.cur_len))
+            next_model_kwargs = self.update_inputs_for_generation(model_outputs, state.model_kwargs)
+            return GreedyState(
+                cur_len=state.cur_len + 1,
+                sequences=next_sequences,
+                running_token=next_token,
+                is_sent_finished=next_is_sent_finished,
+                model_kwargs=next_model_kwargs,
+            )
+
+        # The very first prompt often has sequence length > 1, so run outside of `lax.while_loop` to comply with TPU
+        if input_ids.shape[1] > 1:
+            state = greedy_search_body_fn(state)
+
+        if not trace:
+            state = self._run_loop_in_debug(greedy_search_cond_fn, greedy_search_body_fn, state)
+        else:
+            state = lax.while_loop(greedy_search_cond_fn, greedy_search_body_fn, state)
+
+        return FlaxGreedySearchOutput(sequences=state.sequences)
+
+    def _sample(
+        self,
+        input_ids: None,
+        max_length: Optional[int] = None,
+        pad_token_id: Optional[int] = None,
+        eos_token_id: Optional[int] = None,
+        prng_key: Optional[jnp.ndarray] = None,
+        logits_processor: Optional[FlaxLogitsProcessorList] = None,
+        logits_warper: Optional[FlaxLogitsProcessorList] = None,
+        trace: bool = True,
+        params: Optional[Dict[str, jnp.ndarray]] = None,
+        model_kwargs: Optional[Dict[str, jnp.ndarray]] = None,
+    ):
+        # init values
+        max_length = max_length if max_length is not None else self.generation_config.max_length
+        pad_token_id = pad_token_id if pad_token_id is not None else self.generation_config.pad_token_id
+        eos_token_id = eos_token_id if eos_token_id is not None else self.generation_config.eos_token_id
+        prng_key = prng_key if prng_key is not None else jax.random.PRNGKey(0)
+
+        batch_size, cur_len = input_ids.shape
+
+        eos_token_id = jnp.array(eos_token_id, dtype=jnp.int32 if eos_token_id is not None else None)
+        pad_token_id = jnp.array(pad_token_id, dtype=jnp.int32)
+        cur_len = jnp.array(cur_len)
+
+        # per batch-item holding current token in loop.
+        sequences = jnp.full((batch_size, max_length), pad_token_id, dtype=jnp.int32)
+        sequences = lax.dynamic_update_slice(sequences, input_ids, (0, 0))
+
+        # per batch-item state bit indicating if sentence has finished.
+        is_sent_finished = jnp.zeros((batch_size,), dtype=jnp.bool_)
+
+        # For Seq2Seq generation, we only need to use the decoder instead of the whole model in generation loop
+        # and pass it the `encoder_outputs`, which are part of the `model_kwargs`.
+        model = self.decode if self.config.is_encoder_decoder else self
+
+        # initialize model specific kwargs
+        model_kwargs = self.prepare_inputs_for_generation(input_ids, max_length, **model_kwargs)
+
+        # initialize state
+        state = SampleState(
+            cur_len=cur_len,
+            sequences=sequences,
+            running_token=input_ids,
+            is_sent_finished=is_sent_finished,
+            prng_key=prng_key,
+            model_kwargs=model_kwargs,
+        )
+
+        def sample_search_cond_fn(state):
+            """state termination condition fn."""
+            has_reached_max_length = state.cur_len == max_length
+            all_sequence_finished = jnp.all(state.is_sent_finished)
+            finish_generation = jnp.logical_or(has_reached_max_length, all_sequence_finished)
+            return ~finish_generation
+
+        def sample_search_body_fn(state):
+            """state update fn."""
+            prng_key, prng_key_next = jax.random.split(state.prng_key)
+            model_outputs = model(state.running_token, params=params, **state.model_kwargs)
+
+            logits = model_outputs.logits[:, -1]
+
+            # apply min_length, ...
+            logits = logits_processor(state.sequences, logits, state.cur_len)
+            # apply top_p, top_k, temperature
+            logits = logits_warper(logits, logits, state.cur_len)
+
+            next_token = jax.random.categorical(prng_key, logits, axis=-1)
+
+            next_token = next_token * ~state.is_sent_finished + pad_token_id * state.is_sent_finished
+            next_is_sent_finished = state.is_sent_finished | (next_token == eos_token_id)
+            next_token = next_token[:, None]
+
+            next_sequences = lax.dynamic_update_slice(state.sequences, next_token, (0, state.cur_len))
+            next_model_kwargs = self.update_inputs_for_generation(model_outputs, state.model_kwargs)
+
+            return SampleState(
+                cur_len=state.cur_len + 1,
+                sequences=next_sequences,
+                running_token=next_token,
+                is_sent_finished=next_is_sent_finished,
+                model_kwargs=next_model_kwargs,
+                prng_key=prng_key_next,
+            )
+
+        # The very first prompt often has sequence length > 1, so run outside of `lax.while_loop` to comply with TPU
+        if input_ids.shape[1] > 1:
+            state = sample_search_body_fn(state)
+
+        if not trace:
+            state = self._run_loop_in_debug(sample_search_cond_fn, sample_search_body_fn, state)
+        else:
+            state = lax.while_loop(sample_search_cond_fn, sample_search_body_fn, state)
+
+        return FlaxSampleOutput(sequences=state.sequences)
+
+    def _beam_search(
+        self,
+        input_ids: None,
+        max_length: Optional[int] = None,
+        pad_token_id: Optional[int] = None,
+        eos_token_id: Optional[int] = None,
+        length_penalty: Optional[float] = None,
+        early_stopping: Optional[Union[bool, str]] = None,
+        logits_processor: Optional[FlaxLogitsProcessorList] = None,
+        trace: bool = True,
+        params: Optional[Dict[str, jnp.ndarray]] = None,
+        num_return_sequences: Optional[int] = None,
+        model_kwargs: Optional[Dict[str, jnp.ndarray]] = None,
+    ):
+        """
+        This beam search function is heavily inspired by Flax's official example:
+        https://github.com/google/flax/blob/main/examples/wmt/decode.py
+        """
+
+        def flatten_beam_dim(tensor):
+            """Flattens the first two dimensions of a non-scalar array."""
+            # ignore scalars (e.g. cache index)
+            if tensor.ndim == 0:
+                return tensor
+            return tensor.reshape((tensor.shape[0] * tensor.shape[1],) + tensor.shape[2:])
+
+        def unflatten_beam_dim(tensor, batch_size, num_beams):
+            """Unflattens the first, flat batch*beam dimension of a non-scalar array."""
+            # ignore scalars (e.g. cache index)
+            if tensor.ndim == 0:
+                return tensor
+            return tensor.reshape((batch_size, num_beams) + tensor.shape[1:])
+
+        def gather_beams(nested, beam_indices, batch_size, new_num_beams):
+            """
+            Gathers the beam slices indexed by beam_indices into new beam array.
+            """
+            batch_indices = jnp.reshape(
+                jnp.arange(batch_size * new_num_beams) // new_num_beams, (batch_size, new_num_beams)
+            )
+
+            def gather_fn(tensor):
+                # ignore scalars (e.g. cache index)
+                if tensor.ndim == 0:
+                    return tensor
+                else:
+                    return tensor[batch_indices, beam_indices]
+
+            return jax.tree_util.tree_map(gather_fn, nested)
+
+        # init values
+        max_length = max_length if max_length is not None else self.generation_config.max_length
+        pad_token_id = pad_token_id if pad_token_id is not None else self.generation_config.pad_token_id
+        eos_token_id = eos_token_id if eos_token_id is not None else self.generation_config.eos_token_id
+        length_penalty = length_penalty if length_penalty is not None else self.generation_config.length_penalty
+        early_stopping = early_stopping if early_stopping is not None else self.generation_config.early_stopping
+        num_return_sequences = (
+            num_return_sequences if num_return_sequences is not None else self.generation_config.num_return_sequences
+        )
+
+        batch_size, num_beams, cur_len = input_ids.shape
+
+        eos_token_id = jnp.array(eos_token_id, dtype=jnp.int32 if eos_token_id is not None else None)
+        pad_token_id = jnp.array(pad_token_id, dtype=jnp.int32)
+        cur_len = jnp.array(cur_len)
+
+        # record the prompt length of decoder
+        decoder_prompt_len = input_ids.shape[-1]
+
+        # per batch,beam-item holding current token in loop.
+        sequences = jnp.full((batch_size, num_beams, max_length), pad_token_id, dtype=jnp.int32)
+        running_sequences = jnp.full((batch_size, num_beams, max_length), pad_token_id, dtype=jnp.int32)
+        running_sequences = lax.dynamic_update_slice(sequences, input_ids, (0, 0, 0))
+
+        # per batch,beam-item state bit indicating if sentence has finished.
+        is_sent_finished = jnp.zeros((batch_size, num_beams), dtype=jnp.bool_)
+
+        # per batch,beam-item score, logprobs
+        running_scores = jnp.tile(jnp.array([0.0] + [np.array(-1.0e7)] * (num_beams - 1)), [batch_size, 1])
+        scores = jnp.ones((batch_size, num_beams)) * np.array(-1.0e7)
+
+        # For Seq2Seq generation, we only need to use the decoder instead of the whole model in generation loop
+        # and pass it the `encoder_outputs`, which are part of the `model_kwargs`.
+        model = self.decode if self.config.is_encoder_decoder else self
+
+        # flatten beam dim
+        if "encoder_outputs" in model_kwargs:
+            model_kwargs["encoder_outputs"]["last_hidden_state"] = flatten_beam_dim(
+                model_kwargs["encoder_outputs"]["last_hidden_state"]
+            )
+        for kwarg in ["attention_mask", "decoder_attention_mask"]:
+            if kwarg in model_kwargs:
+                model_kwargs[kwarg] = flatten_beam_dim(model_kwargs[kwarg])
+
+        # initialize model specific kwargs
+        model_kwargs = self.prepare_inputs_for_generation(flatten_beam_dim(input_ids), max_length, **model_kwargs)
+
+        # initialize state
+        state = BeamSearchState(
+            cur_len=cur_len,
+            running_sequences=running_sequences,
+            running_scores=running_scores,
+            sequences=sequences,
+            scores=scores,
+            is_sent_finished=is_sent_finished,
+            model_kwargs=model_kwargs,
+        )
+
+        def beam_search_cond_fn(state):
+            """beam search state termination condition fn."""
+
+            # 1. is less than max length?
+            not_max_length_yet = state.cur_len < max_length
+
+            # 2. can the new beams still improve?
+            # early_stopping == False -> apply heuristic = always get the best score from `cur_len`. See the discussion
+            # below for more details.
+            # https://github.com/huggingface/transformers/pull/20901#issuecomment-1369845565
+            # early_stopping == "never" -> compute the best score from max_length or cur_len, depending on the sign of
+            #   length_penalty. Positive length_penalty favors longer sequences, thus we use max_length there.
+            if early_stopping == "never" and length_penalty > 0.0:
+                best_running_score = state.running_scores[:, :1] / (
+                    (max_length - decoder_prompt_len) ** length_penalty
+                )
+            else:
+                best_running_score = state.running_scores[:, :1] / (
+                    (state.cur_len - decoder_prompt_len) ** length_penalty
+                )
+            worst_finished_score = jnp.where(
+                state.is_sent_finished, jnp.min(state.scores, axis=1, keepdims=True), np.array(-1.0e7)
+            )
+            improvement_still_possible = jnp.any(best_running_score > worst_finished_score)
+
+            # 3. is there still a beam that has not finished?
+            still_open_beam = ~(jnp.all(state.is_sent_finished) & (early_stopping is True))
+
+            return not_max_length_yet & still_open_beam & improvement_still_possible
+
+        def beam_search_body_fn(state, input_ids_length=1):
+            """beam search state update fn."""
+            # 1. Forward current tokens
+            # Collect the current position slice along length to feed the fast
+            # autoregressive decoder model.  Flatten the beam dimension into batch
+            # dimension for feeding into the model.
+            # unflatten beam dimension
+            # Unflatten beam dimension in attention cache arrays
+            input_token = flatten_beam_dim(
+                lax.dynamic_slice(
+                    state.running_sequences,
+                    (0, 0, state.cur_len - input_ids_length),
+                    (batch_size, num_beams, input_ids_length),
+                )
+            )
+            model_outputs = model(input_token, params=params, **state.model_kwargs)
+
+            logits = unflatten_beam_dim(model_outputs.logits[:, -1], batch_size, num_beams)
+            cache = jax.tree_util.tree_map(
+                lambda tensor: unflatten_beam_dim(tensor, batch_size, num_beams), model_outputs.past_key_values
+            )
+
+            # adapt logits for FlaxMarianMTModel
+            logits = self._adapt_logits_for_beam_search(logits)
+
+            # 2. Compute log probs
+            # get log probabilities from logits,
+            # process logits with processors (*e.g.* min_length, ...), and
+            # add new logprobs to existing running logprobs scores.
+            log_probs = jax.nn.log_softmax(logits)
+            log_probs = logits_processor(
+                flatten_beam_dim(state.running_sequences), flatten_beam_dim(log_probs), state.cur_len
+            )
+            log_probs = unflatten_beam_dim(log_probs, batch_size, num_beams)
+            log_probs = log_probs + jnp.expand_dims(state.running_scores, axis=2)
+            vocab_size = log_probs.shape[2]
+            log_probs = log_probs.reshape((batch_size, num_beams * vocab_size))
+
+            # 3. Retrieve top-K
+            # Each item in batch has num_beams * vocab_size candidate sequences.
+            # For each item, get the top 2*k candidates with the highest log-
+            # probabilities. We gather the top 2*K beams here so that even if the best
+            # K sequences reach EOS simultaneously, we have another K sequences
+            # remaining to continue the live beam search.
+            # Gather the top 2*K scores from _all_ beams.
+            # Gather 2*k top beams.
+            # Recover the beam index by floor division.
+            # Recover token id by modulo division and expand Id array for broadcasting.
+            # Update sequences for the 2*K top-k new sequences.
+            beams_to_keep = 2 * num_beams
+            topk_log_probs, topk_indices = lax.top_k(log_probs, k=beams_to_keep)
+            topk_beam_indices = topk_indices // vocab_size
+            topk_running_sequences = gather_beams(
+                state.running_sequences, topk_beam_indices, batch_size, beams_to_keep
+            )
+            topk_ids = jnp.expand_dims(topk_indices % vocab_size, axis=2)
+            topk_sequences = lax.dynamic_update_slice(topk_running_sequences, topk_ids, (0, 0, state.cur_len))
+
+            # 4. Check which sequences have ended
+            # Update current sequences:
+            # Did any of these sequences reach an end marker?
+            # To prevent these just finished sequences from being added to the current sequences
+            # set of active beam search sequences, set their log probs to a very large
+            # negative value.
+            did_topk_just_finished = topk_sequences[:, :, state.cur_len] == eos_token_id
+            running_topk_log_probs = topk_log_probs + did_topk_just_finished * np.array(-1.0e7)
+            # 5. Get running sequences scores for next
+            # Determine the top k beam indices (from top 2*k beams) from log probs
+            # and gather top k beams (from top 2*k beams).
+            next_topk_indices = lax.top_k(running_topk_log_probs, k=num_beams)[1]
+            next_running_sequences, next_running_scores = gather_beams(
+                [topk_sequences, running_topk_log_probs], next_topk_indices, batch_size, num_beams
+            )
+
+            # 6. Process topk logits
+            # Further process log probs:
+            # - add length penalty
+            # - make sure no scores can be added anymore if beam is full
+            # - make sure still running sequences cannot be chosen as finalized beam
+            topk_log_probs = topk_log_probs / ((state.cur_len + 1 - decoder_prompt_len) ** length_penalty)
+            beams_in_batch_are_full = jnp.broadcast_to(
+                state.is_sent_finished.all(axis=-1, keepdims=True), did_topk_just_finished.shape
+            ) & (early_stopping is True)
+            add_penalty = ~did_topk_just_finished | beams_in_batch_are_full
+            topk_log_probs += add_penalty * np.array(-1.0e7)
+
+            # 7. Get scores, sequences, is sentence finished for next.
+            # Combine sequences, scores, and flags along the beam dimension and compare
+            # new finished sequence scores to existing finished scores and select the
+            # best from the new set of beams
+            merged_sequences = jnp.concatenate([state.sequences, topk_sequences], axis=1)
+            merged_scores = jnp.concatenate([state.scores, topk_log_probs], axis=1)
+            merged_is_sent_finished = jnp.concatenate([state.is_sent_finished, did_topk_just_finished], axis=1)
+            topk_merged_indices = lax.top_k(merged_scores, k=num_beams)[1]
+            next_sequences, next_scores, next_is_sent_finished = gather_beams(
+                [merged_sequences, merged_scores, merged_is_sent_finished], topk_merged_indices, batch_size, num_beams
+            )
+
+            # 8. Update model kwargs.
+            # Determine the top k beam indices from the original set of all beams.
+            # With these, gather the top k beam-associated caches.
+            next_running_indices = gather_beams(topk_beam_indices, next_topk_indices, batch_size, num_beams)
+            next_cache = gather_beams(cache, next_running_indices, batch_size, num_beams)
+            model_outputs["past_key_values"] = jax.tree_util.tree_map(lambda x: flatten_beam_dim(x), next_cache)
+            next_model_kwargs = self.update_inputs_for_generation(model_outputs, state.model_kwargs)
+
+            return BeamSearchState(
+                cur_len=state.cur_len + 1,
+                running_scores=next_running_scores,
+                running_sequences=next_running_sequences,
+                scores=next_scores,
+                sequences=next_sequences,
+                is_sent_finished=next_is_sent_finished,
+                model_kwargs=next_model_kwargs,
+            )
+
+        # Always run first iteration outside of `lax.while_loop` to avoid calling `beam_search_cond_fn`
+        # when `state.cur_len` equals `decoder_prompt_len`. This also helps to comply with TPU when
+        # the very first prompt has sequence length > 1.
+        state = partial(beam_search_body_fn, input_ids_length=input_ids.shape[-1])(state)
+
+        if not trace:
+            state = self._run_loop_in_debug(beam_search_cond_fn, beam_search_body_fn, state)
+        else:
+            state = lax.while_loop(beam_search_cond_fn, beam_search_body_fn, state)
+
+        # Account for the edge-case where there are no finished sequences for a
+        # particular batch item. If so, return running sequences for that batch item.
+        none_finished = jnp.any(state.is_sent_finished, axis=1)
+        sequences = jnp.where(none_finished[:, None, None], state.sequences, state.running_sequences)
+        scores = jnp.where(none_finished[:, None], state.scores, state.running_scores)
+
+        # Take best beams for each batch (the score is sorted in descending order)
+        sequences = flatten_beam_dim(sequences[:, :num_return_sequences, :])
+        scores = flatten_beam_dim(scores[:, :num_return_sequences])
+
+        return FlaxBeamSearchOutput(sequences=sequences, scores=scores)
diff --git a/.venv/lib/python3.11/site-packages/transformers/generation/logits_process.py b/.venv/lib/python3.11/site-packages/transformers/generation/logits_process.py
new file mode 100644
index 0000000000000000000000000000000000000000..39a38f9139ec1b4cc752c67fecac018edf7cbf79
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/transformers/generation/logits_process.py
@@ -0,0 +1,2955 @@
+# coding=utf-8
+# Copyright 2024 The HuggingFace Inc. team and Google DeepMind.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import inspect
+import math
+from typing import Callable, Iterable, List, Optional, Tuple, Union
+
+import numpy as np
+import torch
+
+from ..pytorch_utils import isin_mps_friendly
+from ..utils import add_start_docstrings
+from ..utils.logging import get_logger
+
+
+logger = get_logger(__name__)
+
+
+LOGITS_PROCESSOR_INPUTS_DOCSTRING = r"""
+    Args:
+        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
+            Indices of input sequence tokens in the vocabulary. [What are input IDs?](../glossary#input-ids)
+        scores (`torch.FloatTensor` of shape `(batch_size, config.vocab_size)`):
+            Prediction scores of a language modeling head. These can be logits for each vocabulary when not using beam
+            search or log softmax for each vocabulary token when using beam search
+
+    Return:
+        `torch.FloatTensor` of shape `(batch_size, config.vocab_size)`: The processed prediction scores.
+
+"""
+
+
+class LogitsProcessor:
+    """Abstract base class for all logit processors that can be applied during generation."""
+
+    @add_start_docstrings(LOGITS_PROCESSOR_INPUTS_DOCSTRING)
+    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor:
+        raise NotImplementedError(
+            f"{self.__class__} is an abstract class. Only classes inheriting this class can be called."
+        )
+
+
+class LogitsWarper:
+    """Abstract base class for all logit warpers that can be applied during generation with multinomial sampling."""
+
+    def __init__(self):
+        logger.warning_once(
+            "`LogitsWarper` is deprecated and will be removed in v4.48. Your class should inherit `LogitsProcessor` "
+            "instead, which has the same properties and interface."
+        )
+
+    @add_start_docstrings(LOGITS_PROCESSOR_INPUTS_DOCSTRING)
+    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor:
+        raise NotImplementedError(
+            f"{self.__class__} is an abstract class. Only classes inheriting this class can be called."
+        )
+
+
+class LogitsProcessorList(list):
+    """
+    This class can be used to create a list of [`LogitsProcessor`] to subsequently process a `scores` input tensor.
+    This class inherits from list and adds a specific *__call__* method to apply each [`LogitsProcessor`] to the
+    inputs.
+    """
+
+    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> torch.FloatTensor:
+        r"""
+        Args:
+            input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
+                Indices of input sequence tokens in the vocabulary. [What are input IDs?](../glossary#input-ids)
+            scores (`torch.FloatTensor` of shape `(batch_size, config.vocab_size)`):
+                Prediction scores of a language modeling head. These can be logits for each vocabulary when not using
+                beam search or log softmax for each vocabulary token when using beam search
+            kwargs (`Dict[str, Any]`, *optional*):
+                Additional kwargs that are specific to a logits processor.
+
+        Return:
+            `torch.FloatTensor` of shape `(batch_size, config.vocab_size)`:
+                The processed prediction scores.
+
+        """
+        for processor in self:
+            function_args = inspect.signature(processor.__call__).parameters
+            if len(function_args) > 2:
+                if not all(arg in kwargs for arg in list(function_args.keys())[2:]):
+                    raise ValueError(
+                        f"Make sure that all the required parameters: {list(function_args.keys())} for "
+                        f"{processor.__class__} are passed to the logits processor."
+                    )
+                scores = processor(input_ids, scores, **kwargs)
+            else:
+                scores = processor(input_ids, scores)
+
+        return scores
+
+
+class MinLengthLogitsProcessor(LogitsProcessor):
+    r"""
+    [`LogitsProcessor`] enforcing a min-length by setting EOS probability to 0. Note that, for decoder-only models
+    like most LLMs, the length includes the prompt.
+
+    Args:
+        min_length (`int`):
+            The minimum length below which the score of `eos_token_id` is set to `-float("Inf")`.
+        eos_token_id (`Union[int, List[int], torch.Tensor]`):
+            The id(s) of the *end-of-sequence* token.
+        device (`str`, *optional*, defaults to `"cpu"`):
+            The device to allocate the tensors.
+
+    Examples:
+
+    ```python
+    >>> from transformers import AutoModelForCausalLM, AutoTokenizer
+
+    >>> tokenizer = AutoTokenizer.from_pretrained("bigscience/bloomz-560m")
+    >>> model = AutoModelForCausalLM.from_pretrained("bigscience/bloomz-560m")
+
+    >>> inputs = tokenizer("A number:", return_tensors="pt")
+    >>> gen_out = model.generate(**inputs)
+    >>> print(tokenizer.batch_decode(gen_out, skip_special_tokens=True)[0])
+    A number: one
+
+    >>> # setting `min_length` to a value smaller than the uncontrolled output length has no impact
+    >>> gen_out = model.generate(**inputs, min_length=3)
+    >>> print(tokenizer.batch_decode(gen_out, skip_special_tokens=True)[0])
+    A number: one
+
+    >>> # setting a larger `min_length` will force the model to generate beyond its natural ending point, which is not
+    >>> # necessarily incorrect
+    >>> gen_out = model.generate(**inputs, min_length=10)
+    >>> print(tokenizer.batch_decode(gen_out, skip_special_tokens=True)[0])
+    A number: one thousand, nine hundred and ninety-four
+    ```
+    """
+
+    def __init__(self, min_length: int, eos_token_id: Union[int, List[int], torch.Tensor], device: str = "cpu"):
+        if not isinstance(min_length, int) or min_length < 0:
+            raise ValueError(f"`min_length` has to be a non-negative integer, but is {min_length}")
+
+        if not isinstance(eos_token_id, torch.Tensor):
+            if isinstance(eos_token_id, int):
+                eos_token_id = [eos_token_id]
+            eos_token_id = torch.tensor(eos_token_id, device=device)
+
+        self.min_length = min_length
+        self.eos_token_id = eos_token_id
+
+    @add_start_docstrings(LOGITS_PROCESSOR_INPUTS_DOCSTRING)
+    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor:
+        vocab_tensor = torch.arange(scores.shape[-1], device=scores.device)
+        eos_token_mask = isin_mps_friendly(vocab_tensor, self.eos_token_id)
+        scores_processed = scores.clone()
+        if input_ids.shape[-1] < self.min_length:
+            scores_processed = torch.where(eos_token_mask, -math.inf, scores)
+        return scores_processed
+
+
+class MinNewTokensLengthLogitsProcessor(LogitsProcessor):
+    r"""
+    [`LogitsProcessor`] enforcing a min-length of new tokens by setting EOS (End-Of-Sequence) token probability to 0.
+    Contrarily to [`MinLengthLogitsProcessor`], this processor ignores the prompt.
+
+    Args:
+        prompt_length_to_skip (`int`):
+            The input tokens length. Not a valid argument when used with `generate` as it will automatically assign the
+            input length.
+        min_new_tokens (`int`):
+            The minimum *new* tokens length below which the score of `eos_token_id` is set to `-float("Inf")`.
+        eos_token_id (`Union[int, List[int], torch.Tensor]`):
+            The id(s) of the *end-of-sequence* token.
+        device (`str`, *optional*, defaults to `"cpu"`):
+            The device to allocate the tensors.
+
+    Examples:
+
+    ```python
+    >>> from transformers import AutoModelForCausalLM, AutoTokenizer
+
+    >>> tokenizer = AutoTokenizer.from_pretrained("bigscience/bloomz-560m")
+    >>> model = AutoModelForCausalLM.from_pretrained("bigscience/bloomz-560m")
+
+    >>> inputs = tokenizer(["A number:"], return_tensors="pt")
+    >>> gen_out = model.generate(**inputs)
+    >>> print(tokenizer.batch_decode(gen_out, skip_special_tokens=True)[0])
+    A number: one
+
+    >>> # setting `min_new_tokens` will force the model to generate beyond its natural ending point, which is not
+    >>> # necessarily incorrect
+    >>> gen_out = model.generate(**inputs, min_new_tokens=2)
+    >>> print(tokenizer.batch_decode(gen_out, skip_special_tokens=True)[0])
+    A number: one thousand
+    ```
+    """
+
+    def __init__(
+        self,
+        prompt_length_to_skip: int,
+        min_new_tokens: int,
+        eos_token_id: Union[int, List[int], torch.Tensor],
+        device: str = "cpu",
+    ):
+        for arg_name, arg_value in [
+            ("prompt_length_to_skip", prompt_length_to_skip),
+            ("min_new_tokens", min_new_tokens),
+        ]:
+            if not isinstance(arg_value, int) or arg_value < 0:
+                raise ValueError(f"`{arg_name}` has to be a positive integer, but is {arg_value}")
+
+        if not isinstance(eos_token_id, torch.Tensor):
+            if isinstance(eos_token_id, int):
+                eos_token_id = [eos_token_id]
+            eos_token_id = torch.tensor(eos_token_id, device=device)
+
+        self.prompt_length_to_skip = prompt_length_to_skip
+        self.min_new_tokens = min_new_tokens
+        self.eos_token_id = eos_token_id
+
+    @add_start_docstrings(LOGITS_PROCESSOR_INPUTS_DOCSTRING)
+    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor:
+        new_tokens_length = input_ids.shape[-1] - self.prompt_length_to_skip
+        scores_processed = scores.clone()
+        vocab_tensor = torch.arange(scores.shape[-1], device=scores.device)
+        eos_token_mask = isin_mps_friendly(vocab_tensor, self.eos_token_id)
+        if new_tokens_length < self.min_new_tokens:
+            scores_processed = torch.where(eos_token_mask, -math.inf, scores)
+
+        return scores_processed
+
+
+class TemperatureLogitsWarper(LogitsProcessor):
+    r"""
+    [`LogitsProcessor`] for temperature (exponential scaling output probability distribution), which effectively means
+    that it can control the randomness of the predicted tokens. Often used together with [`TopPLogitsWarper`] and
+    [`TopKLogitsWarper`].
+
+    <Tip>
+
+    Make sure that `do_sample=True` is included in the `generate` arguments otherwise the temperature value won't have
+    any effect.
+
+    </Tip>
+
+    Args:
+        temperature (`float`):
+            Strictly positive float value used to modulate the logits distribution. A value smaller than `1` decreases
+            randomness (and vice versa), with `0` being equivalent to shifting all probability mass to the most likely
+            token.
+
+    Examples:
+
+    ```python
+    >>> import torch
+    >>> from transformers import AutoTokenizer, AutoModelForCausalLM, set_seed
+
+    >>> set_seed(0)  # for reproducibility
+
+    >>> tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2")
+    >>> model = AutoModelForCausalLM.from_pretrained("openai-community/gpt2")
+    >>> model.config.pad_token_id = model.config.eos_token_id
+    >>> inputs = tokenizer(["Hugging Face Company is"], return_tensors="pt")
+
+    >>> # With temperature=1.0, the default, we consistently get random outputs due to random sampling.
+    >>> generate_kwargs = {"max_new_tokens": 10, "do_sample": True, "temperature": 1.0, "num_return_sequences": 2}
+    >>> outputs = model.generate(**inputs, **generate_kwargs)
+    >>> print(tokenizer.batch_decode(outputs, skip_special_tokens=True))
+    ['Hugging Face Company is one of these companies that is going to take a',
+    "Hugging Face Company is a brand created by Brian A. O'Neil"]
+
+    >>> # However, with temperature close to 0, it approximates greedy decoding strategies (invariant)
+    >>> generate_kwargs["temperature"] = 0.0001
+    >>> outputs = model.generate(**inputs, **generate_kwargs)
+    >>> print(tokenizer.batch_decode(outputs, skip_special_tokens=True))
+    ['Hugging Face Company is a company that has been around for over 20 years',
+    'Hugging Face Company is a company that has been around for over 20 years']
+    ```
+    """
+
+    def __init__(self, temperature: float):
+        if not isinstance(temperature, float) or not (temperature > 0):
+            except_msg = (
+                f"`temperature` (={temperature}) has to be a strictly positive float, otherwise your next token "
+                "scores will be invalid."
+            )
+            if isinstance(temperature, float) and temperature == 0.0:
+                except_msg += " If you're looking for greedy decoding strategies, set `do_sample=False`."
+            raise ValueError(except_msg)
+
+        self.temperature = temperature
+
+    @add_start_docstrings(LOGITS_PROCESSOR_INPUTS_DOCSTRING)
+    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor:
+        scores_processed = scores / self.temperature
+        return scores_processed
+
+
+class RepetitionPenaltyLogitsProcessor(LogitsProcessor):
+    r"""
+    [`LogitsProcessor`] that prevents the repetition of previous tokens through a penalty. This penalty is applied at
+    most once per token. Note that, for decoder-only models like most LLMs, the considered tokens include the prompt.
+
+    In the original [paper](https://arxiv.org/pdf/1909.05858.pdf), the authors suggest the use of a penalty of around
+    1.2 to achieve a good balance between truthful generation and lack of repetition. To penalize and reduce
+    repetition, use `penalty` values above 1.0, where a higher value penalizes more strongly. To reward and encourage
+    repetition, use `penalty` values between 0.0 and 1.0, where a lower value rewards more strongly.
+
+    Args:
+        penalty (`float`):
+            The parameter for repetition penalty. 1.0 means no penalty. Above 1.0 penalizes previously generated
+            tokens. Between 0.0 and 1.0 rewards previously generated tokens.
+
+    Examples:
+
+    ```py
+    >>> from transformers import AutoTokenizer, AutoModelForCausalLM
+
+    >>> # Initializing the model and tokenizer for it
+    >>> model = AutoModelForCausalLM.from_pretrained("distilbert/distilgpt2")
+    >>> tokenizer = AutoTokenizer.from_pretrained("distilbert/distilgpt2")
+    >>> inputs = tokenizer(["I'm not going to"], return_tensors="pt")
+
+    >>> # This shows a normal generate without any specific parameters
+    >>> summary_ids = model.generate(**inputs)
+    >>> print(tokenizer.batch_decode(summary_ids, skip_special_tokens=True)[0])
+    I'm not going to be able to do that. I'm going to be able to do that
+
+    >>> # This generates a penalty for repeated tokens
+    >>> penalized_ids = model.generate(**inputs, repetition_penalty=1.1)
+    >>> print(tokenizer.batch_decode(penalized_ids, skip_special_tokens=True)[0])
+    I'm not going to be able to do that. I'll just have to go out and play
+    ```
+    """
+
+    def __init__(self, penalty: float):
+        if not isinstance(penalty, float) or not (penalty > 0):
+            raise ValueError(f"`penalty` has to be a strictly positive float, but is {penalty}")
+
+        self.penalty = penalty
+
+    @add_start_docstrings(LOGITS_PROCESSOR_INPUTS_DOCSTRING)
+    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor:
+        score = torch.gather(scores, 1, input_ids)
+
+        # if score < 0 then repetition penalty has to be multiplied to reduce the token probabilities
+        score = torch.where(score < 0, score * self.penalty, score / self.penalty)
+
+        scores_processed = scores.scatter(1, input_ids, score)
+        return scores_processed
+
+
+class EncoderRepetitionPenaltyLogitsProcessor(LogitsProcessor):
+    r"""
+    [`LogitsProcessor`] that works similarly to [`RepetitionPenaltyLogitsProcessor`], but with an *inverse* penalty
+    that is applied to the tokens present in the prompt. In other words, a penalty above 1.0 increases the odds of
+    selecting tokens that were present in the prompt.
+
+    It was designed to avoid hallucination in input-grounded tasks, like summarization. Although originally intended
+    for encoder-decoder models, it can also be used with decoder-only models like LLMs.
+
+    Args:
+        penalty (`float`):
+            The parameter for repetition penalty. 1.0 means no penalty. Above 1.0 rewards prompt tokens. Between 0.0
+            and 1.0 penalizes prompt tokens.
+        encoder_input_ids (`torch.LongTensor`):
+            The encoder_input_ids that should be repeated within the decoder ids.
+
+    Examples:
+
+    ```python
+    >>> from transformers import AutoModelForCausalLM, AutoTokenizer
+
+    >>> tokenizer = AutoTokenizer.from_pretrained("bigscience/bloomz-560m")
+    >>> model = AutoModelForCausalLM.from_pretrained("bigscience/bloomz-560m")
+
+    >>> inputs = tokenizer(["Alice and Bob. The third member's name was"], return_tensors="pt")
+    >>> gen_out = model.generate(**inputs)
+    >>> print(tokenizer.batch_decode(gen_out, skip_special_tokens=True)[0])
+    Alice and Bob. The third member's name was not mentioned.
+
+    >>> # With the `encoder_repetition_penalty` argument we can trigger this logits processor in `generate`, which can
+    >>> # promote the use of prompt tokens ("Bob" in this example)
+    >>> gen_out = model.generate(**inputs, encoder_repetition_penalty=1.2)
+    >>> print(tokenizer.batch_decode(gen_out, skip_special_tokens=True)[0])
+    Alice and Bob. The third member's name was Bob. The third member's name was Bob.
+    ```
+    """
+
+    def __init__(self, penalty: float, encoder_input_ids: torch.LongTensor):
+        if not isinstance(penalty, float) or not (penalty > 0):
+            raise ValueError(f"`penalty` has to be a strictly positive float, but is {penalty}")
+
+        self.penalty = 1 / penalty
+        self.encoder_input_ids = encoder_input_ids
+
+    @add_start_docstrings(LOGITS_PROCESSOR_INPUTS_DOCSTRING)
+    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor:
+        score = torch.gather(scores, 1, self.encoder_input_ids)
+
+        # if score < 0 then hallucination penalty has to be multiplied to increase the token probabilities
+        score = torch.where(score < 0, score * self.penalty, score / self.penalty)
+
+        scores_processed = scores.scatter(1, self.encoder_input_ids, score)
+        return scores_processed
+
+
+class TopPLogitsWarper(LogitsProcessor):
+    """
+    [`LogitsProcessor`] that performs top-p, i.e. restricting to top tokens summing to prob_cut_off <= prob_cut_off.
+    Often used together with [`TemperatureLogitsWarper`] and [`TopKLogitsWarper`].
+
+    Args:
+        top_p (`float`):
+            If set to < 1, only the smallest set of most probable tokens with probabilities that add up to `top_p` or
+            higher are kept for generation.
+        filter_value (`float`, *optional*, defaults to -inf):
+            All filtered values will be set to this float value.
+        min_tokens_to_keep (`int`, *optional*, defaults to 1):
+            Minimum number of tokens that cannot be filtered.
+
+    Examples:
+
+    ```python
+    >>> from transformers import AutoTokenizer, AutoModelForCausalLM, set_seed
+
+    >>> set_seed(1)
+    >>> model = AutoModelForCausalLM.from_pretrained("distilbert/distilgpt2")
+    >>> tokenizer = AutoTokenizer.from_pretrained("distilbert/distilgpt2")
+
+    >>> inputs = tokenizer("A sequence: 1, 2", return_tensors="pt")
+
+    >>> # With sampling, the output is unexpected -- sometimes too unexpected.
+    >>> outputs = model.generate(**inputs, do_sample=True)
+    >>> print(tokenizer.batch_decode(outputs, skip_special_tokens=True)[0])
+    A sequence: 1, 2, 3 | < 4 (left-hand pointer) ;
+    <BLANKLINE>
+    <BLANKLINE>
+
+    >>> # With `top_p` sampling, the output gets restricted to high-probability tokens.
+    >>> # Pro tip: In practice, LLMs use `top_p` in the 0.9-0.95 range.
+    >>> outputs = model.generate(**inputs, do_sample=True, top_p=0.1)
+    >>> print(tokenizer.batch_decode(outputs, skip_special_tokens=True)[0])
+    A sequence: 1, 2, 3, 4, 5, 6, 7, 8, 9
+    ```
+    """
+
+    def __init__(self, top_p: float, filter_value: float = -float("Inf"), min_tokens_to_keep: int = 1):
+        top_p = float(top_p)
+        if top_p < 0 or top_p > 1.0:
+            raise ValueError(f"`top_p` has to be a float > 0 and < 1, but is {top_p}")
+        if not isinstance(min_tokens_to_keep, int) or (min_tokens_to_keep < 1):
+            raise ValueError(f"`min_tokens_to_keep` has to be a positive integer, but is {min_tokens_to_keep}")
+
+        self.top_p = top_p
+        self.filter_value = filter_value
+        self.min_tokens_to_keep = min_tokens_to_keep
+
+    @add_start_docstrings(LOGITS_PROCESSOR_INPUTS_DOCSTRING)
+    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor:
+        sorted_logits, sorted_indices = torch.sort(scores, descending=False)
+        cumulative_probs = sorted_logits.softmax(dim=-1).cumsum(dim=-1)
+
+        # Remove tokens with cumulative top_p above the threshold (token with 0 are kept)
+        sorted_indices_to_remove = cumulative_probs <= (1 - self.top_p)
+        # Keep at least min_tokens_to_keep
+        sorted_indices_to_remove[..., -self.min_tokens_to_keep :] = 0
+
+        # scatter sorted tensors to original indexing
+        indices_to_remove = sorted_indices_to_remove.scatter(1, sorted_indices, sorted_indices_to_remove)
+        scores_processed = scores.masked_fill(indices_to_remove, self.filter_value)
+        return scores_processed
+
+
+class TopKLogitsWarper(LogitsProcessor):
+    r"""
+    [`LogitsProcessor`] that performs top-k, i.e. restricting to the k highest probability elements. Often used
+    together with [`TemperatureLogitsWarper`] and [`TopPLogitsWarper`].
+
+    Args:
+        top_k (`int`):
+            The number of highest probability vocabulary tokens to keep for top-k-filtering.
+        filter_value (`float`, *optional*, defaults to -inf):
+            All filtered values will be set to this float value.
+        min_tokens_to_keep (`int`, *optional*, defaults to 1):
+            Minimum number of tokens that cannot be filtered.
+
+    Examples:
+
+    ```python
+    >>> from transformers import AutoTokenizer, AutoModelForCausalLM, set_seed
+
+    >>> set_seed(1)
+    >>> model = AutoModelForCausalLM.from_pretrained("distilbert/distilgpt2")
+    >>> tokenizer = AutoTokenizer.from_pretrained("distilbert/distilgpt2")
+
+    >>> inputs = tokenizer("A sequence: A, B, C, D", return_tensors="pt")
+
+    >>> # With sampling, the output is unexpected -- sometimes too unexpected.
+    >>> outputs = model.generate(**inputs, do_sample=True)
+    >>> print(tokenizer.batch_decode(outputs, skip_special_tokens=True)[0])
+    A sequence: A, B, C, D, E — S — O, P — R
+
+    >>> # With `top_k` sampling, the output gets restricted the k most likely tokens.
+    >>> # Pro tip: In practice, LLMs use `top_k` in the 5-50 range.
+    >>> outputs = model.generate(**inputs, do_sample=True, top_k=2)
+    >>> print(tokenizer.batch_decode(outputs, skip_special_tokens=True)[0])
+    A sequence: A, B, C, D, E, F, G, H, I
+    ```
+    """
+
+    def __init__(self, top_k: int, filter_value: float = -float("Inf"), min_tokens_to_keep: int = 1):
+        if not isinstance(top_k, int) or top_k <= 0:
+            raise ValueError(f"`top_k` has to be a strictly positive integer, but is {top_k}")
+
+        self.top_k = max(top_k, min_tokens_to_keep)
+        self.filter_value = filter_value
+
+    @add_start_docstrings(LOGITS_PROCESSOR_INPUTS_DOCSTRING)
+    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor:
+        top_k = min(self.top_k, scores.size(-1))  # Safety check
+        # Remove all tokens with a probability less than the last token of the top-k
+        indices_to_remove = scores < torch.topk(scores, top_k)[0][..., -1, None]
+        scores_processed = scores.masked_fill(indices_to_remove, self.filter_value)
+        return scores_processed
+
+
+class MinPLogitsWarper(LogitsProcessor):
+    """
+    [`LogitsProcessor`] that performs min-p, i.e. keeps all tokens that are above a minimum probability, scaled by the
+    probability of the most likely token. As a result, the filter becomes more agressive in the presence of
+    high-probability tokens, which is a sign of a confident output that we shouldn't deviate from.
+
+    Often used together with [`TemperatureLogitsWarper`]. Used as an alternative to [`TopPLogitsWarper`] and
+    [`TopKLogitsWarper`].
+
+    Created by @menhguin and @kalomaze (github handles). Code adapted from [this external PR](https://github.com/oobabooga/text-generation-webui/pull/4449/files)
+
+    Args:
+        min_p (`float`):
+            Minimum token probability, which will be scaled by the probability of the most likely token. It must be a
+            value between 0 and 1. Typical values are in the 0.01-0.2 range, comparably selective as setting `top_p` in
+            the 0.99-0.8 range (use the opposite of normal `top_p` values).
+        filter_value (`float`, *optional*, defaults to -inf):
+            All filtered values will be set to this float value.
+        min_tokens_to_keep (`int`, *optional*, defaults to 1):
+            Minimum number of tokens that cannot be filtered.
+
+    Examples:
+
+    ```python
+    >>> from transformers import AutoTokenizer, AutoModelForCausalLM, set_seed
+
+    >>> set_seed(1)
+    >>> model = AutoModelForCausalLM.from_pretrained("distilbert/distilgpt2")
+    >>> tokenizer = AutoTokenizer.from_pretrained("distilbert/distilgpt2")
+
+    >>> inputs = tokenizer("A sequence: 1, 2", return_tensors="pt")
+
+    >>> # With sampling, the output is unexpected -- sometimes too unexpected.
+    >>> outputs = model.generate(**inputs, do_sample=True)
+    >>> print(tokenizer.batch_decode(outputs, skip_special_tokens=True)[0])
+    A sequence: 1, 2, 3 | < 4 (left-hand pointer) ;
+    <BLANKLINE>
+    <BLANKLINE>
+
+    >>> # With `min_p` sampling, the output gets restricted to high-probability tokens.
+    >>> # Pro tip: In practice, LLMs use `min_p` in the 0.01-0.2 range.
+    >>> outputs = model.generate(**inputs, do_sample=True, min_p=0.1)
+    >>> print(tokenizer.batch_decode(outputs, skip_special_tokens=True)[0])
+    A sequence: 1, 2, 3, 4, 5, 6, 7, 8, 9
+    ```
+    """
+
+    def __init__(self, min_p: float, filter_value: float = -float("Inf"), min_tokens_to_keep: int = 1):
+        if not (0 <= min_p <= 1.0):
+            raise ValueError(f"`min_p` has to be a float in the [0, 1] interval, but is {min_p}")
+        if not isinstance(min_tokens_to_keep, int) or (min_tokens_to_keep < 1):
+            raise ValueError(f"`min_tokens_to_keep` has to be a positive integer, but is {min_tokens_to_keep}")
+
+        self.min_p = min_p
+        self.filter_value = filter_value
+        self.min_tokens_to_keep = min_tokens_to_keep
+
+    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor:
+        # Convert logits to probabilities
+        probs = torch.softmax(scores, dim=-1)
+        # Get the probability of the top token for each sequence in the batch
+        top_probs, _ = probs.max(dim=-1, keepdim=True)
+        # Calculate the actual min_p threshold by scaling min_p with the top token's probability
+        scaled_min_p = self.min_p * top_probs
+        # Create a mask for tokens that have a probability less than the scaled min_p
+        tokens_to_remove = probs < scaled_min_p
+
+        sorted_indices = torch.argsort(scores, descending=True, dim=-1)
+        sorted_indices_to_remove = torch.gather(tokens_to_remove, dim=-1, index=sorted_indices)
+        # Keep at least min_tokens_to_keep
+        sorted_indices_to_remove[..., : self.min_tokens_to_keep] = False
+
+        indices_to_remove = sorted_indices_to_remove.scatter(1, sorted_indices, sorted_indices_to_remove)
+        scores_processed = scores.masked_fill(indices_to_remove, self.filter_value)
+        return scores_processed
+
+
+class TypicalLogitsWarper(LogitsProcessor):
+    r"""
+    [`LogitsProcessor`] that performs typical decoding. Inspired on how humans use language, it prioritizes tokens
+    whose log probability is close to the entropy of the token probability distribution. This means that the most
+    likely tokens may be discarded in the process.
+
+    See [Typical Decoding for Natural Language Generation](https://arxiv.org/abs/2202.00666) for more information.
+
+    Args:
+        mass (`float`, *optional*, defaults to 0.9):
+            Value of typical_p between 0 and 1 inclusive, defaults to 0.9.
+        filter_value (`float`, *optional*, defaults to -inf):
+            All filtered values will be set to this float value.
+        min_tokens_to_keep (`int`, *optional*, defaults to 1):
+            Minimum number of tokens that cannot be filtered.
+
+    Examples:
+
+    ```python
+    >>> from transformers import AutoTokenizer, AutoModelForCausalLM, set_seed
+
+    >>> model = AutoModelForCausalLM.from_pretrained("bigscience/bloomz-560m")
+    >>> tokenizer = AutoTokenizer.from_pretrained("bigscience/bloomz-560m")
+
+    >>> inputs = tokenizer("1, 2, 3", return_tensors="pt")
+
+    >>> # We can see that greedy decoding produces a sequence of numbers
+    >>> outputs = model.generate(**inputs)
+    >>> print(tokenizer.batch_decode(outputs, skip_special_tokens=True)[0])
+    1, 2, 3, 4, 5, 6, 7, 8, 9, 10,
+
+    >>> # For this particular seed, we can see that sampling produces nearly the same low-information (= low entropy)
+    >>> # sequence
+    >>> set_seed(18)
+    >>> outputs = model.generate(**inputs, do_sample=True)
+    >>> print(tokenizer.batch_decode(outputs, skip_special_tokens=True)[0])
+    1, 2, 3, 4, 5, 6, 7, 8, 9 and 10
+
+    >>> # With `typical_p` set, the most obvious sequence is no longer produced, which may be good for your problem
+    >>> set_seed(18)
+    >>> outputs = model.generate(
+    ...     **inputs, do_sample=True, typical_p=0.1, return_dict_in_generate=True, output_scores=True
+    ... )
+    >>> print(tokenizer.batch_decode(outputs.sequences, skip_special_tokens=True)[0])
+    1, 2, 3 and 5
+
+    >>> # We can see that the token corresponding to "4" (token 934) in the second position, the most likely token
+    >>> # as seen with greedy decoding, was entirely blocked out
+    >>> print(outputs.scores[1][0, 934])
+    tensor(-inf)
+    ```
+    """
+
+    def __init__(self, mass: float = 0.9, filter_value: float = -float("Inf"), min_tokens_to_keep: int = 1):
+        mass = float(mass)
+        if not (mass > 0 and mass < 1):
+            raise ValueError(f"`typical_p` has to be a float > 0 and < 1, but is {mass}")
+        if not isinstance(min_tokens_to_keep, int) or (min_tokens_to_keep < 1):
+            raise ValueError(f"`min_tokens_to_keep` has to be a positive integer, but is {min_tokens_to_keep}")
+
+        self.filter_value = filter_value
+        self.mass = mass
+        self.min_tokens_to_keep = min_tokens_to_keep
+
+    @add_start_docstrings(LOGITS_PROCESSOR_INPUTS_DOCSTRING)
+    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor:
+        # calculate entropy
+        normalized = torch.nn.functional.log_softmax(scores, dim=-1)
+        p = torch.exp(normalized)
+        ent = -(normalized * p).nansum(-1, keepdim=True)
+
+        # shift and sort
+        shifted_scores = torch.abs((-normalized) - ent)
+        sorted_scores, sorted_indices = torch.sort(shifted_scores, descending=False)
+        sorted_logits = scores.gather(-1, sorted_indices)
+        cumulative_probs = sorted_logits.softmax(dim=-1).cumsum(dim=-1)
+
+        # Remove tokens with cumulative mass above the threshold
+        last_ind = (cumulative_probs < self.mass).sum(dim=1)
+        last_ind.clamp_(max=sorted_scores.shape[-1] - 1)
+        sorted_indices_to_remove = sorted_scores > sorted_scores.gather(1, last_ind.view(-1, 1))
+        sorted_indices_to_remove[..., : self.min_tokens_to_keep] = 0
+        indices_to_remove = sorted_indices_to_remove.scatter(1, sorted_indices, sorted_indices_to_remove)
+
+        scores_processed = scores.masked_fill(indices_to_remove, self.filter_value)
+        return scores_processed
+
+
+class EpsilonLogitsWarper(LogitsProcessor):
+    r"""
+    [`LogitsProcessor`] that performs epsilon-sampling, i.e. restricting to tokens with `prob >= epsilon`. Takes the
+    largest min_tokens_to_keep tokens if no tokens satisfy this constraint. See [Truncation Sampling as Language Model
+    Desmoothing](https://arxiv.org/abs/2210.15191) for more information.
+
+    Args:
+        epsilon (`float`):
+            If set to > 0, only the most tokens with probabilities `epsilon` or higher are kept for generation.
+        filter_value (`float`, *optional*, defaults to -inf):
+            All filtered values will be set to this float value.
+        min_tokens_to_keep (`int`, *optional*, defaults to 1):
+            Minimum number of tokens that cannot be filtered.
+
+    Examples:
+    ```python
+    >>> from transformers import AutoTokenizer, AutoModelForCausalLM, set_seed
+
+    >>> set_seed(1)
+    >>> model = AutoModelForCausalLM.from_pretrained("distilbert/distilgpt2")
+    >>> tokenizer = AutoTokenizer.from_pretrained("distilbert/distilgpt2")
+
+    >>> inputs = tokenizer("A sequence: 1, 2", return_tensors="pt")
+
+    >>> # With sampling, the output is unexpected -- sometimes too unexpected.
+    >>> outputs = model.generate(**inputs, do_sample=True)
+    >>> print(tokenizer.batch_decode(outputs, skip_special_tokens=True)[0])
+    A sequence: 1, 2, 3 | < 4 (left-hand pointer) ;
+    <BLANKLINE>
+    <BLANKLINE>
+
+    >>> # With epsilon sampling, the output gets restricted to high-probability tokens. Note that this is similar to
+    >>> # Top P sampling, which restricts tokens based on their cumulative probability.
+    >>> # Pro tip: The paper recomends using `epsilon_cutoff` values between 3e-4 and 9e-4
+    >>> outputs = model.generate(**inputs, do_sample=True, epsilon_cutoff=0.1)
+    >>> print(tokenizer.batch_decode(outputs, skip_special_tokens=True)[0])
+    A sequence: 1, 2, 3, 4, 5, 6, 7, 8, 9
+    ```
+    """
+
+    def __init__(self, epsilon: float, filter_value: float = -float("Inf"), min_tokens_to_keep: int = 1):
+        epsilon = float(epsilon)
+        if epsilon <= 0 or epsilon >= 1:
+            raise ValueError(f"`epsilon_cutoff` has to be a float > 0 and < 1, but is {epsilon}")
+
+        min_tokens_to_keep = int(min_tokens_to_keep)
+        if min_tokens_to_keep < 1:
+            raise ValueError(
+                f"`min_tokens_to_keep` has to be a strictly positive integer, but is {min_tokens_to_keep}"
+            )
+
+        self.epsilon = epsilon
+        self.filter_value = filter_value
+        self.min_tokens_to_keep = min_tokens_to_keep
+
+    @add_start_docstrings(LOGITS_PROCESSOR_INPUTS_DOCSTRING)
+    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor:
+        # Determine which indices to remove
+        probabilities = scores.softmax(dim=-1)
+        indices_to_remove = probabilities < self.epsilon
+
+        # Keep the words with the 'min_tokens_to_keep'-highest probabilities
+        top_k = min(self.min_tokens_to_keep, scores.size(-1))  # Safety check
+        indices_to_remove = indices_to_remove & (scores < torch.topk(scores, top_k)[0][..., -1, None])
+
+        scores_processed = scores.masked_fill(indices_to_remove, self.filter_value)
+        return scores_processed
+
+
+class EtaLogitsWarper(LogitsProcessor):
+    r"""
+    [`LogitsProcessor`] that performs eta-sampling, a technique to filter out tokens with probabilities below a dynamic
+    cutoff value, `eta`, which is calculated based on a combination of the hyperparameter `epsilon` and the entropy of
+    the token probabilities, i.e. `eta := min(epsilon, sqrt(epsilon * e^-entropy(probabilities)))`. Takes the largest
+    min_tokens_to_keep tokens if no tokens satisfy this constraint. It addresses the issue of poor quality in long
+    samples of text generated by neural language models leading to more coherent and fluent text. See [Truncation
+    Sampling as Language Model Desmoothing](https://arxiv.org/abs/2210.15191) for more information. Note: `do_sample`
+    must be set to `True` for this `LogitsProcessor` to work.
+
+
+    Args:
+        epsilon (`float`):
+            A float value in the range (0, 1). Hyperparameter used to calculate the dynamic cutoff value, `eta`. The
+            suggested values from the paper ranges from 3e-4 to 4e-3 depending on the size of the model.
+        filter_value (`float`, *optional*, defaults to -inf):
+            All values that are found to be below the dynamic cutoff value, `eta`, are set to this float value. This
+            parameter is useful when logits need to be modified for very low probability tokens that should be excluded
+            from generation entirely.
+        min_tokens_to_keep (`int`, *optional*, defaults to 1):
+            Specifies the minimum number of tokens that must be kept for generation, regardless of their probabilities.
+            For example, if `min_tokens_to_keep` is set to 1, at least one token will always be kept for generation,
+            even if all tokens have probabilities below the cutoff `eta`.
+        device (`str`, *optional*, defaults to `"cpu"`):
+            The device to allocate the tensors.
+
+    Examples:
+    ```python
+    >>> from transformers import AutoTokenizer, AutoModelForCausalLM, set_seed
+
+    >>> set_seed(1)
+    >>> model = AutoModelForCausalLM.from_pretrained("distilbert/distilgpt2")
+    >>> tokenizer = AutoTokenizer.from_pretrained("distilbert/distilgpt2")
+
+    >>> inputs = tokenizer("A sequence: 1, 2", return_tensors="pt")
+
+    >>> # With sampling, the output is unexpected -- sometimes too unexpected.
+    >>> outputs = model.generate(**inputs, do_sample=True)
+    >>> print(tokenizer.batch_decode(outputs, skip_special_tokens=True)[0])
+    A sequence: 1, 2, 3 | < 4 (left-hand pointer) ;
+    <BLANKLINE>
+    <BLANKLINE>
+
+    >>> # With eta sampling, the output gets restricted to high-probability tokens. You can see it as a dynamic form of
+    >>> # epsilon sampling that adapts its cutoff probability based on the entropy (high entropy = lower cutoff).
+    >>> # Pro tip: The paper recomends using `eta_cutoff` values between 3e-4 to 4e-3
+    >>> outputs = model.generate(**inputs, do_sample=True, eta_cutoff=0.1)
+    >>> print(tokenizer.batch_decode(outputs, skip_special_tokens=True)[0])
+    A sequence: 1, 2, 3, 4, 5, 6, 7, 8, 9
+    ```
+    """
+
+    def __init__(
+        self, epsilon: float, filter_value: float = -float("Inf"), min_tokens_to_keep: int = 1, device: str = "cpu"
+    ):
+        epsilon = float(epsilon)
+        if epsilon <= 0 or epsilon >= 1:
+            raise ValueError(f"`eta_cutoff` has to be a float > 0 and < 1, but is {epsilon}")
+
+        min_tokens_to_keep = int(min_tokens_to_keep)
+        if min_tokens_to_keep < 1:
+            raise ValueError(
+                f"`min_tokens_to_keep` has to be a strictly positive integer, but is {min_tokens_to_keep}"
+            )
+
+        self.epsilon = torch.tensor(epsilon, device=device)
+        self.filter_value = filter_value
+        self.min_tokens_to_keep = min_tokens_to_keep
+
+    @add_start_docstrings(LOGITS_PROCESSOR_INPUTS_DOCSTRING)
+    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor:
+        probabilities = scores.softmax(dim=-1)
+        entropy = torch.distributions.Categorical(logits=scores).entropy()
+        eta = torch.min(self.epsilon, torch.sqrt(self.epsilon) * torch.exp(-entropy))[..., None]
+        indices_to_remove = probabilities < eta
+
+        # Keep the words with the 'min_tokens_to_keep'-highest probabilities
+        top_k = min(self.min_tokens_to_keep, scores.size(-1))  # Safety check
+        indices_to_remove = indices_to_remove & (scores < torch.topk(scores, top_k)[0][..., -1, None])
+
+        scores_processed = scores.masked_fill(indices_to_remove, self.filter_value)
+        return scores_processed
+
+
+def _get_ngrams(ngram_size: int, prev_input_ids: torch.Tensor, num_hypos: int):
+    """
+    Assume ngram_size=2 and prev_input_ids=tensor([[40, 2883, 2712, 4346]]). The output of generated ngrams look like
+    this {(40,): [2883], (2883,): [2712], (2712,): [4346]}.
+
+    Args:
+        ngram_size (`int`):
+            The number sequential tokens taken as a group which may only occur once before being banned.
+        prev_input_ids (`torch.Tensor`):
+           Generated token ids for the current hypothesis.
+        num_hypos (`int`):
+            The number of hypotheses for which n-grams need to be generated.
+
+    Returns:
+        generated_ngrams (`dict`):
+            Dictionary of generated ngrams.
+    """
+    # Initialize an empty list of dictionaries, one for each hypothesis (index) in the range of num_hypos
+    generated_ngrams = [{} for _ in range(num_hypos)]
+    for idx in range(num_hypos):
+        gen_tokens = prev_input_ids[idx].tolist()
+        generated_ngram = generated_ngrams[idx]
+        # Loop through each n-gram of size ngram_size in the list of tokens (gen_tokens)
+        for ngram in zip(*[gen_tokens[i:] for i in range(ngram_size)]):
+            prev_ngram_tuple = tuple(ngram[:-1])
+            generated_ngram[prev_ngram_tuple] = generated_ngram.get(prev_ngram_tuple, []) + [ngram[-1]]
+    return generated_ngrams
+
+
+def _get_generated_ngrams(banned_ngrams, prev_input_ids, ngram_size, cur_len):
+    """
+    Determines the banned tokens for the current hypothesis based on previously generated n-grams.
+
+    Args:
+        banned_ngrams (`dict`):
+            A dictionary containing previously generated n-grams for each hypothesis.
+        prev_input_ids (`torch.Tensor`):
+            Generated token ids for the current hypothesis.
+        ngram_size (`int`):
+            The number sequential tokens taken as a group which may only occur once before being banned.
+        cur_len (`int`):
+            The current length of the token sequences for which the n-grams are being checked.
+
+    Returns:
+        List of tokens that are banned.
+    """
+    # Before decoding the next token, prevent decoding of ngrams that have already appeared
+    start_idx = cur_len + 1 - ngram_size
+    ngram_idx = tuple(prev_input_ids[start_idx:cur_len].tolist())
+    return banned_ngrams.get(ngram_idx, [])
+
+
+def _calc_banned_ngram_tokens(
+    ngram_size: int, prev_input_ids: torch.Tensor, num_hypos: int, cur_len: int
+) -> List[Iterable[int]]:
+    """Copied from fairseq for no_repeat_ngram in beam_search"""
+    if cur_len + 1 < ngram_size:
+        # return no banned tokens if we haven't generated no_repeat_ngram_size tokens yet
+        return [[] for _ in range(num_hypos)]
+    generated_ngrams = _get_ngrams(ngram_size, prev_input_ids, num_hypos)
+    banned_tokens = [
+        _get_generated_ngrams(generated_ngrams[hypo_idx], prev_input_ids[hypo_idx], ngram_size, cur_len)
+        for hypo_idx in range(num_hypos)
+    ]
+    return banned_tokens
+
+
+class NoRepeatNGramLogitsProcessor(LogitsProcessor):
+    r"""
+    N-grams are groups of "n" consecutive words, characters, or tokens taken from a sequence of text. Given the
+    sentence: "She runs fast", the bi-grams (n=2) would be ("she", "runs") and ("runs", "fast"). In text generation,
+    avoiding repetitions of word sequences provides a more diverse output. This [`LogitsProcessor`] enforces no
+    repetition of n-grams by setting the scores of banned tokens to negative infinity which eliminates those tokens
+    from consideration when further processing the scores. Note that, for decoder-only models like most LLMs, the
+    prompt is also considered to obtain the n-grams.
+    [Fairseq](https://github.com/pytorch/fairseq/blob/a07cb6f40480928c9e0548b737aadd36ee66ac76/fairseq/sequence_generator.py#L345).
+
+    <Tip>
+
+    Use n-gram penalties with care. For instance, penalizing 2-grams (bigrams) in an article about the city of New York
+    might lead to undesirable outcomes where the city's name appears only once in the entire text.
+    [Reference](https://huggingface.co/blog/how-to-generate)
+
+    </Tip>
+
+    Args:
+        ngram_size (`int`):
+            All ngrams of size `ngram_size` can only occur once.
+
+    Examples:
+
+    ```py
+    >>> from transformers import AutoTokenizer, AutoModelForCausalLM
+
+    >>> model = AutoModelForCausalLM.from_pretrained("distilbert/distilgpt2")
+    >>> tokenizer = AutoTokenizer.from_pretrained("distilbert/distilgpt2")
+    >>> inputs = tokenizer(["Today I"], return_tensors="pt")
+
+    >>> output = model.generate(**inputs)
+    >>> print(tokenizer.decode(output[0], skip_special_tokens=True))
+    Today I’m not sure if I’m going to be able to do it.
+
+    >>> # Now let's add ngram size using `no_repeat_ngram_size`. This stops the repetitions ("I’m") in the output.
+    >>> output = model.generate(**inputs, no_repeat_ngram_size=2)
+    >>> print(tokenizer.decode(output[0], skip_special_tokens=True))
+    Today I’m not sure if I can get a better understanding of the nature of this issue
+    ```
+    """
+
+    def __init__(self, ngram_size: int):
+        if not isinstance(ngram_size, int) or ngram_size <= 0:
+            raise ValueError(f"`ngram_size` has to be a strictly positive integer, but is {ngram_size}")
+        self.ngram_size = ngram_size
+
+    @add_start_docstrings(LOGITS_PROCESSOR_INPUTS_DOCSTRING)
+    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor:
+        num_batch_hypotheses = scores.shape[0]
+        cur_len = input_ids.shape[-1]
+        scores_processed = scores.clone()
+        banned_batch_tokens = _calc_banned_ngram_tokens(self.ngram_size, input_ids, num_batch_hypotheses, cur_len)
+        for i, banned_tokens in enumerate(banned_batch_tokens):
+            scores_processed[i, banned_tokens] = -float("inf")
+
+        return scores_processed
+
+
+class EncoderNoRepeatNGramLogitsProcessor(LogitsProcessor):
+    r"""
+    [`LogitsProcessor`] that works similarly to [`NoRepeatNGramLogitsProcessor`], but applied exclusively to prevent
+    the repetition of n-grams present in the prompt.
+
+    It was designed to promote chattiness in a language model, by preventing the generation of n-grams present in
+    previous conversation rounds.
+
+    Args:
+        encoder_ngram_size (`int`):
+            All ngrams of size `ngram_size` can only occur within the encoder input ids.
+        encoder_input_ids (`int`):
+            The encoder_input_ids that should not be repeated within the decoder ids.
+
+    Examples:
+
+    ```py
+    >>> from transformers import AutoTokenizer, AutoModelForCausalLM
+
+    >>> model = AutoModelForCausalLM.from_pretrained("bigscience/bloomz-560m")
+    >>> tokenizer = AutoTokenizer.from_pretrained("bigscience/bloomz-560m")
+
+    >>> inputs = tokenizer("Alice: I love cats. What do you love?\nBob:", return_tensors="pt")
+
+    >>> # With greedy decoding, we see Bob repeating Alice's opinion. If Bob was a chatbot, it would be a poor one.
+    >>> outputs = model.generate(**inputs)
+    >>> print(tokenizer.batch_decode(outputs, skip_special_tokens=True)[0])
+    Alice: I love cats. What do you love?
+    Bob: I love cats. What do you
+
+    >>> # With this logits processor, we can prevent Bob from repeating Alice's opinion.
+    >>> outputs = model.generate(**inputs, encoder_no_repeat_ngram_size=2)
+    >>> print(tokenizer.batch_decode(outputs, skip_special_tokens=True)[0])
+    Alice: I love cats. What do you love?
+    Bob: My cats are very cute.
+    ```
+    """
+
+    def __init__(self, encoder_ngram_size: int, encoder_input_ids: torch.LongTensor):
+        if not isinstance(encoder_ngram_size, int) or encoder_ngram_size <= 0:
+            raise ValueError(
+                f"`encoder_ngram_size` has to be a strictly positive integer, but is {encoder_ngram_size}"
+            )
+        self.ngram_size = encoder_ngram_size
+        if len(encoder_input_ids.shape) == 1:
+            encoder_input_ids = encoder_input_ids.unsqueeze(0)
+        self.batch_size = encoder_input_ids.shape[0]
+        self.generated_ngrams = _get_ngrams(encoder_ngram_size, encoder_input_ids, self.batch_size)
+
+    @add_start_docstrings(LOGITS_PROCESSOR_INPUTS_DOCSTRING)
+    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor:
+        # B x num_beams
+        num_hypos = scores.shape[0]
+        num_beams = num_hypos // self.batch_size
+        cur_len = input_ids.shape[-1]
+        scores_processed = scores.clone()
+        banned_batch_tokens = [
+            _get_generated_ngrams(
+                self.generated_ngrams[hypo_idx // num_beams], input_ids[hypo_idx], self.ngram_size, cur_len
+            )
+            for hypo_idx in range(num_hypos)
+        ]
+
+        for i, banned_tokens in enumerate(banned_batch_tokens):
+            scores_processed[i, banned_tokens] = -float("inf")
+
+        return scores_processed
+
+
+class SequenceBiasLogitsProcessor(LogitsProcessor):
+    """
+    [`LogitsProcessor`] that applies an additive bias on sequences. The bias is applied to the last token of a sequence
+    when the next generated token can complete it. Consequently, to take the most of biasing sequences with more than
+    one token, consider using beam methods (to gracefully work around partially completed sequences that have a
+    negative bias) and applying the bias to their prefixes (to ensure the bias is applied earlier).
+
+    <Tip>
+
+    In order to get the token ids of the sequences that you want to bias, make sure to set `add_prefix_space=True` when
+    initializing the tokenizer, and use `tokenizer(bad_words, add_special_tokens=False).input_ids`. The
+    `add_prefix_space` argument is only supported for some slow tokenizers, as fast tokenizers' prefixing behaviours
+    come from `pre tokenizers`. Read more [here](https://huggingface.co/docs/tokenizers/api/pre-tokenizers).
+
+    </Tip>
+
+    Args:
+        sequence_bias (`List[List[Union[List[int], float]]]`):
+            List of lists that maps a sequence of tokens to its bias term (e.g. `[[[10, 45], -2.0],
+            [[64], -7.5]]`). Positive biases increase the odds of the
+            sequence being selected, while negative biases do the opposite. If a sequence has a length of 1, its bias
+            will always be applied. Otherwise, the bias will only be applied if the sequence in question is about to be
+            completed (in the token selection step after this processor is applied).
+
+    Examples:
+
+    ```python
+    >>> from transformers import AutoTokenizer, AutoModelForCausalLM
+
+    >>> model = AutoModelForCausalLM.from_pretrained("openai-community/gpt2")
+    >>> tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2")
+    >>> inputs = tokenizer(["The full name of Donald is Donald"], return_tensors="pt")
+
+    >>> summary_ids = model.generate(inputs["input_ids"], max_new_tokens=4)
+    >>> print(tokenizer.batch_decode(summary_ids, skip_special_tokens=True)[0])
+    The full name of Donald is Donald J. Trump Jr
+
+    >>> # Now let's control generation through a bias. Please note that the tokenizer is initialized differently!
+    >>> tokenizer_with_prefix_space = AutoTokenizer.from_pretrained("openai-community/gpt2", add_prefix_space=True)
+
+
+    >>> def get_tokens(word):
+    ...     return tokenizer_with_prefix_space([word], add_special_tokens=False).input_ids[0]
+
+
+    >>> # If we add a negative bias without beam search, it may become "stuck" in a prefix without good continuations
+    >>> sequence_bias = [get_tokens("Trump"), -10.0]
+    >>> biased_ids = model.generate(inputs["input_ids"], max_new_tokens=4, sequence_bias=sequence_bias)
+    >>> print(tokenizer.batch_decode(biased_ids, skip_special_tokens=True)[0])
+    The full name of Donald is Donald J. Donald,
+
+    >>> biased_ids = model.generate(inputs["input_ids"], max_new_tokens=4, num_beams=4, sequence_bias=sequence_bias)
+    >>> print(tokenizer.batch_decode(biased_ids, skip_special_tokens=True)[0])
+    The full name of Donald is Donald Rumsfeld,
+
+    >>> # We can also add a positive bias to nudge the model towards specific tokens or continuations
+    >>> sequence_bias = [get_tokens("Donald Duck"), 10.0]
+    >>> biased_ids = model.generate(inputs["input_ids"], max_new_tokens=4, num_beams=4, sequence_bias=sequence_bias)
+    >>> print(tokenizer.batch_decode(biased_ids, skip_special_tokens=True)[0])
+    The full name of Donald is Donald Duck.
+    ```
+    """
+
+    def __init__(self, sequence_bias: List[List[Union[List[int], float]]]):
+        self.sequence_bias = sequence_bias
+        self._validate_arguments()
+        self._convert_list_arguments_into_dict()
+
+        # Bias variables that will be populated on the first call (for retrocompatibility purposes, the vocabulary size
+        # is infered in the first usage, which inhibits initializing here)
+        self.length_1_bias = None
+        self.prepared_bias_variables = False
+
+    @add_start_docstrings(LOGITS_PROCESSOR_INPUTS_DOCSTRING)
+    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor:
+        # 1 - Prepares the bias tensors. This is only needed the first time the logit processor is called.
+        if not self.prepared_bias_variables:
+            self._prepare_bias_variables(scores)
+
+        # 2 - prepares an empty bias to add
+        bias = torch.zeros_like(scores)
+
+        # 3 - include the bias from length = 1
+        bias += self.length_1_bias
+
+        # 4 - include the bias from length > 1, after determining which biased sequences may be completed.
+        for sequence_ids, sequence_bias in self.sequence_bias.items():
+            if len(sequence_ids) == 1:  # the sequence is of length 1, already applied
+                continue
+            if len(sequence_ids) > input_ids.shape[1]:  # the sequence is longer than the context, ignore
+                continue
+            prefix_length = len(sequence_ids) - 1
+            last_token = sequence_ids[-1]
+            matching_rows = torch.eq(
+                input_ids[:, -prefix_length:],
+                torch.tensor(sequence_ids[:-1], dtype=input_ids.dtype, device=input_ids.device),
+            ).prod(dim=1)
+            bias[:, last_token] += torch.where(
+                matching_rows.bool(),
+                torch.tensor(sequence_bias, device=input_ids.device),
+                torch.tensor(0.0, device=input_ids.device),
+            )
+
+        # 5 - apply the bias to the scores
+        scores_processed = scores + bias
+        return scores_processed
+
+    def _prepare_bias_variables(self, scores: torch.FloatTensor):
+        vocabulary_size = scores.shape[-1]
+
+        # Check biased tokens out of bounds
+        invalid_biases = []
+        for sequence_ids in self.sequence_bias:
+            for token_id in sequence_ids:
+                if token_id >= vocabulary_size:
+                    invalid_biases.append(token_id)
+        if len(invalid_biases) > 0:
+            raise ValueError(
+                f"The model vocabulary size is {vocabulary_size}, but the following tokens were being biased: "
+                f"{invalid_biases}"
+            )
+
+        # Precompute the bias tensors to be applied. Sequences of length 1 are kept separately, as they can be applied
+        # with simpler logic.
+        self.length_1_bias = torch.zeros((vocabulary_size,), dtype=torch.float).to(scores.device)
+        for sequence_ids, bias in self.sequence_bias.items():
+            if len(sequence_ids) == 1:
+                self.length_1_bias[sequence_ids[-1]] = bias
+
+        self.prepared_bias_variables = True
+
+    def _validate_arguments(self):
+        sequence_bias = self.sequence_bias
+        if not isinstance(sequence_bias, dict) and not isinstance(sequence_bias, list) or len(sequence_bias) == 0:
+            raise ValueError(
+                f"`sequence_bias` has to be a non-empty dictionary, or non-empty list of lists but is {sequence_bias}."
+            )
+        if isinstance(sequence_bias, dict) and any(
+            not isinstance(sequence_ids, tuple) for sequence_ids in sequence_bias.keys()
+        ):
+            raise ValueError(f"`sequence_bias` has to be a dict with tuples as keys, but is {sequence_bias}.")
+        if isinstance(sequence_bias, dict) and any(
+            any((not isinstance(token_id, (int, np.integer)) or token_id < 0) for token_id in sequence_ids)
+            or len(sequence_ids) == 0
+            for sequence_ids in sequence_bias.keys()
+        ):
+            raise ValueError(
+                f"Each key in `sequence_bias` has to be a non-empty tuple of positive integers, but is "
+                f"{sequence_bias}."
+            )
+
+        def all_token_bias_pairs_are_valid(sequence):
+            return (
+                isinstance(sequence[0], list)
+                and all(isinstance(token_id, (int, np.integer)) and token_id > 0 for token_id in sequence[0])
+                and isinstance(sequence[1], float)
+            )
+
+        if isinstance(sequence_bias, list) and any(
+            (not all_token_bias_pairs_are_valid(sequence)) or len(sequence) == 0 for sequence in sequence_bias
+        ):
+            raise ValueError(
+                f"Each element in `sequence_bias` has to be a non-empty list of lists of positive integers and float, but is "
+                f"{sequence_bias}."
+            )
+        if isinstance(sequence_bias, dict) and any(not isinstance(bias, float) for bias in sequence_bias.values()):
+            raise ValueError(f"`sequence_bias` has to be a dict with floats as values, but is {sequence_bias}.")
+
+    def _convert_list_arguments_into_dict(self):
+        """BC: we used to accept `dict{tuple of tokens: float}` directly, now we expect a list"""
+        if isinstance(self.sequence_bias, list):
+            temp_sequence = self.sequence_bias
+            self.sequence_bias = {tuple(sublist[0]): sublist[1] for sublist in temp_sequence}
+
+
+class NoBadWordsLogitsProcessor(SequenceBiasLogitsProcessor):
+    """
+    [`LogitsProcessor`] that enforces that specified sequences will never be selected.
+
+    <Tip>
+
+    In order to get the token ids of the words that should not appear in the generated text, make sure to set
+    `add_prefix_space=True` when initializing the tokenizer, and use `tokenizer(bad_words,
+    add_special_tokens=False).input_ids`. The `add_prefix_space` argument is only supported for some slow tokenizers,
+    as fast tokenizers' prefixing behaviours come from `pre tokenizers`. Read more
+    [here](https://huggingface.co/docs/tokenizers/api/pre-tokenizers).
+
+    </Tip>
+
+    Args:
+        bad_words_ids (`List[List[int]]`):
+            List of list of token ids that are not allowed to be generated.
+        eos_token_id (`Union[int, List[int], torch.Tensor]`, *optional*):
+            The id(s) of the *end-of-sequence* token.
+
+    Examples:
+
+    ```python
+    >>> from transformers import AutoTokenizer, AutoModelForCausalLM
+
+    >>> model = AutoModelForCausalLM.from_pretrained("openai-community/gpt2")
+    >>> tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2")
+    >>> inputs = tokenizer(["In a word, the cake is a"], return_tensors="pt")
+
+    >>> output_ids = model.generate(inputs["input_ids"], max_new_tokens=5, pad_token_id=tokenizer.eos_token_id)
+    >>> print(tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0])
+    In a word, the cake is a bit of a mess.
+
+    >>> # Now let's take the bad words out. Please note that the tokenizer is initialized differently
+    >>> tokenizer_with_prefix_space = AutoTokenizer.from_pretrained("openai-community/gpt2", add_prefix_space=True)
+
+
+    >>> def get_tokens_as_list(word_list):
+    ...     "Converts a sequence of words into a list of tokens"
+    ...     tokens_list = []
+    ...     for word in word_list:
+    ...         tokenized_word = tokenizer_with_prefix_space([word], add_special_tokens=False).input_ids[0]
+    ...         tokens_list.append(tokenized_word)
+    ...     return tokens_list
+
+
+    >>> bad_words_ids = get_tokens_as_list(word_list=["mess"])
+    >>> output_ids = model.generate(
+    ...     inputs["input_ids"], max_new_tokens=5, bad_words_ids=bad_words_ids, pad_token_id=tokenizer.eos_token_id
+    ... )
+    >>> print(tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0])
+    In a word, the cake is a bit of a surprise.
+    ```
+    """
+
+    def __init__(
+        self, bad_words_ids: List[List[int]], eos_token_id: Optional[Union[int, List[int], torch.Tensor]] = None
+    ):
+        self.bad_word_ids = bad_words_ids
+        self._validate_arguments()
+
+        # Filter EOS token from bad_words_ids
+        if eos_token_id is not None:
+            if not isinstance(eos_token_id, torch.Tensor):
+                if isinstance(eos_token_id, int):
+                    eos_token_id = [eos_token_id]
+                eos_token_id = torch.tensor(eos_token_id)
+
+            bad_words_ids = list(
+                filter(lambda bad_token_seq: all(bad_token_seq != [i] for i in eos_token_id), bad_words_ids)
+            )
+
+        # Forbidding a sequence is equivalent to setting its bias to -inf
+        sequence_bias = {tuple(sequence): float("-inf") for sequence in bad_words_ids}
+        super().__init__(sequence_bias=sequence_bias)
+
+    def _validate_arguments(self):
+        bad_words_ids = self.bad_word_ids
+        if not isinstance(bad_words_ids, list) or len(bad_words_ids) == 0:
+            raise ValueError(f"`bad_words_ids` has to be a non-empty list, but is {bad_words_ids}.")
+        if any(not isinstance(bad_word_ids, list) for bad_word_ids in bad_words_ids):
+            raise ValueError(f"`bad_words_ids` has to be a list of lists, but is {bad_words_ids}.")
+        if any(
+            any((not isinstance(token_id, (int, np.integer)) or token_id < 0) for token_id in bad_word_ids)
+            for bad_word_ids in bad_words_ids
+        ):
+            raise ValueError(
+                f"Each list in `bad_words_ids` has to be a list of positive integers, but is {bad_words_ids}."
+            )
+
+
+class PrefixConstrainedLogitsProcessor(LogitsProcessor):
+    r"""
+    [`LogitsProcessor`] that enforces constrained generation and is useful for prefix-conditioned constrained
+    generation. See [Autoregressive Entity Retrieval](https://arxiv.org/abs/2010.00904) for more information.
+
+    Args:
+        prefix_allowed_tokens_fn (`Callable[[int, torch.Tensor], List[int]]`):
+            This function constraints the beam search to allowed tokens only at each step. This function takes 2
+            arguments `inputs_ids` and the batch ID `batch_id`. It has to return a list with the allowed tokens for the
+            next generation step conditioned on the previously generated tokens `inputs_ids` and the batch ID
+            `batch_id`.
+
+    Examples:
+
+    ```py
+    >>> from transformers import AutoTokenizer, AutoModelForCausalLM
+
+    >>> model = AutoModelForCausalLM.from_pretrained("bigscience/bloomz-560m")
+    >>> tokenizer = AutoTokenizer.from_pretrained("bigscience/bloomz-560m")
+
+    >>> inputs = tokenizer("Alice and Bob", return_tensors="pt")
+
+    >>> # By default, it continues generating according to the model's logits
+    >>> outputs = model.generate(**inputs, max_new_tokens=5)
+    >>> print(tokenizer.batch_decode(outputs, skip_special_tokens=True)[0])
+    Alice and Bob are friends
+
+    >>> # We can contrain it with `prefix_allowed_tokens_fn` to force a certain behavior based on a prefix.
+    >>> # For instance, we can force an entire entity to be generated when its beginning is detected.
+    >>> entity = tokenizer(" Bob Marley", return_tensors="pt").input_ids[0]  # 3 tokens
+    >>> def prefix_allowed_tokens_fn(batch_id, input_ids):
+    ...     '''
+    ...     Attempts to generate 'Bob Marley' when 'Bob' is detected.
+    ...     In this case, `batch_id` is not used, but you can set rules for each batch member.
+    ...     '''
+    ...     if input_ids[-1] == entity[0]:
+    ...         return [entity[1].item()]
+    ...     elif input_ids[-2] == entity[0] and input_ids[-1] == entity[1]:
+    ...         return [entity[2].item()]
+    ...     return list(range(tokenizer.vocab_size))  # If no match, allow all tokens
+
+    >>> outputs = model.generate(**inputs, max_new_tokens=5, prefix_allowed_tokens_fn=prefix_allowed_tokens_fn)
+    >>> print(tokenizer.batch_decode(outputs, skip_special_tokens=True)[0])
+    Alice and Bob Marley
+    ```
+    """
+
+    def __init__(self, prefix_allowed_tokens_fn: Callable[[int, torch.Tensor], List[int]], num_beams: int):
+        self._prefix_allowed_tokens_fn = prefix_allowed_tokens_fn
+        self._num_beams = num_beams
+
+    @add_start_docstrings(LOGITS_PROCESSOR_INPUTS_DOCSTRING)
+    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor:
+        mask = torch.full_like(scores, -math.inf)
+        for batch_id, beam_sent in enumerate(input_ids.view(-1, self._num_beams, input_ids.shape[-1])):
+            for beam_id, sent in enumerate(beam_sent):
+                prefix_allowed_tokens = self._prefix_allowed_tokens_fn(batch_id, sent)
+                if len(prefix_allowed_tokens) == 0:
+                    raise ValueError(
+                        f"`prefix_allowed_tokens_fn` returned an empty list for batch ID {batch_id}."
+                        f"This means that the constraint is unsatisfiable. Please check your implementation"
+                        f"of `prefix_allowed_tokens_fn` "
+                    )
+                mask[batch_id * self._num_beams + beam_id, prefix_allowed_tokens] = 0
+
+        scores_processed = scores + mask
+        return scores_processed
+
+
+class HammingDiversityLogitsProcessor(LogitsProcessor):
+    r"""
+    [`LogitsProcessor`] that enforces diverse beam search.
+
+    Note that this logits processor is only effective for [`PreTrainedModel.group_beam_search`]. See [Diverse Beam
+    Search: Decoding Diverse Solutions from Neural Sequence Models](https://arxiv.org/pdf/1610.02424.pdf) for more
+    details.
+
+    Traditional beam search often generates very similar sequences across different beams.
+    `HammingDiversityLogitsProcessor` addresses this by penalizing beams that generate tokens already chosen by other
+    beams in the same time step.
+
+    Args:
+        diversity_penalty (`float`):
+            This value is subtracted from a beam's score if it generates a token same as any beam from other group at a
+            particular time. A higher `diversity_penalty` will enforce greater diversity among the beams. Adjusting
+            this value can help strike a balance between diversity and natural likelihood.
+        num_beams (`int`):
+            Number of beams for beam search. 1 means no beam search.
+        num_beam_groups (`int`):
+            Number of groups to divide `num_beams` into in order to ensure diversity among different groups of beams.
+            [this paper](https://arxiv.org/pdf/1610.02424.pdf) for more details.
+
+    Examples:
+
+    ```python
+    >>> from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
+    >>> import torch
+
+    >>> # Initialize the model and tokenizer
+    >>> tokenizer = AutoTokenizer.from_pretrained("google-t5/t5-base")
+    >>> model = AutoModelForSeq2SeqLM.from_pretrained("google-t5/t5-base")
+
+    >>> # A long text about the solar system
+    >>> text = (
+    ...     "The Solar System is a gravitationally bound system comprising the Sun and the objects that orbit it, "
+    ...     "either directly or indirectly. Of the objects that orbit the Sun directly, the largest are the eight "
+    ...     "planets, with the remainder being smaller objects, such as the five dwarf planets and small Solar System "
+    ...     "bodies. The Solar System formed 4.6 billion years ago from the gravitational collapse of a giant "
+    ...     "interstellar molecular cloud."
+    ... )
+    >>> inputs = tokenizer("summarize: " + text, return_tensors="pt")
+
+    >>> # Generate diverse summary
+    >>> outputs_diverse = model.generate(
+    ...     **inputs,
+    ...     num_beam_groups=2,
+    ...     diversity_penalty=10.0,
+    ...     max_length=100,
+    ...     num_beams=4,
+    ...     num_return_sequences=2,
+    ... )
+    >>> summaries_diverse = tokenizer.batch_decode(outputs_diverse, skip_special_tokens=True)
+
+    >>> # Generate non-diverse summary
+    >>> outputs_non_diverse = model.generate(
+    ...     **inputs,
+    ...     max_length=100,
+    ...     num_beams=4,
+    ...     num_return_sequences=2,
+    ... )
+    >>> summary_non_diverse = tokenizer.batch_decode(outputs_non_diverse, skip_special_tokens=True)
+
+    >>> # With `diversity_penalty`, the resulting beams are much more diverse
+    >>> print(summary_non_diverse)
+    ['the solar system formed 4.6 billion years ago from the collapse of a giant interstellar molecular cloud. of the objects that orbit the Sun directly, the largest are the eight planets.',
+    'the Solar System formed 4.6 billion years ago from the collapse of a giant interstellar molecular cloud. of the objects that orbit the Sun directly, the largest are the eight planets.']
+
+    >>> print(summaries_diverse)
+    ['the solar system formed 4.6 billion years ago from the collapse of a giant interstellar molecular cloud. of the objects that orbit the Sun directly, the largest are the eight planets.',
+    'the solar system formed 4.6 billion years ago from the collapse of a giant interstellar molecular cloud. of the objects that orbit the Sun directly, the largest are the eight planets. the rest of the objects are smaller objects, such as the five dwarf planets and small solar system bodies.']
+    ```
+    """
+
+    def __init__(self, diversity_penalty: float, num_beams: int, num_beam_groups: int):
+        if not isinstance(diversity_penalty, float) or (not diversity_penalty > 0.0):
+            raise ValueError("`diversity_penalty` should be a float strictly larger than 0.")
+        self._diversity_penalty = diversity_penalty
+        if not isinstance(num_beams, int) or num_beams < 2:
+            raise ValueError("`num_beams` should be an integer strictly larger than 1.")
+        self._num_beams = num_beams
+        if not isinstance(num_beam_groups, int) or num_beam_groups < 2:
+            raise ValueError("`num_beam_groups` should be an integer strictly larger than 1.")
+        if num_beam_groups > num_beams:
+            raise ValueError("`beam_groups` has to be smaller or equal to `num_beams`.")
+        self._num_sub_beams = num_beams // num_beam_groups
+
+    def __call__(
+        self,
+        input_ids: torch.LongTensor,
+        scores: torch.FloatTensor,
+        current_tokens: torch.LongTensor,
+        beam_group_idx: int,
+    ) -> torch.FloatTensor:
+        r"""
+        Args:
+            input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
+                Indices of input sequence tokens in the vocabulary. [What are input IDs?](../glossary#input-ids)
+            scores (`torch.FloatTensor` of shape `(batch_size, config.vocab_size)`):
+                Prediction scores of a language modeling head. These can be logits for each vocabulary when not using
+                beam search or log softmax for each vocabulary token when using beam search
+            current_tokens (`torch.LongTensor` of shape `(batch_size)`):
+                Indices of input sequence tokens in the vocabulary, corresponding to the tokens selected by the other
+                beam groups in the current generation step.
+            beam_group_idx (`int`):
+                The index of the beam group currently being processed.
+
+        Return:
+            `torch.FloatTensor` of shape `(batch_size, config.vocab_size)`:
+                The processed prediction scores.
+        """
+        # hamming diversity: penalise using same token in current group which was used in previous groups at
+        # the same time step
+        batch_size = current_tokens.shape[0] // self._num_beams
+        group_start_idx = beam_group_idx * self._num_sub_beams
+        group_end_idx = min(group_start_idx + self._num_sub_beams, self._num_beams)
+        group_size = group_end_idx - group_start_idx
+        vocab_size = scores.shape[-1]
+
+        if group_start_idx == 0:
+            return scores
+
+        scores_processed = scores.clone()
+        for batch_idx in range(batch_size):
+            # predicted tokens of last time step of previous groups
+            previous_group_tokens = current_tokens[
+                batch_idx * self._num_beams : batch_idx * self._num_beams + group_start_idx
+            ]
+            token_frequency = torch.bincount(previous_group_tokens, minlength=vocab_size).to(scores.device)
+            scores_processed[batch_idx * group_size : (batch_idx + 1) * group_size] -= (
+                self._diversity_penalty * token_frequency
+            )
+
+        return scores_processed
+
+
+class ForcedBOSTokenLogitsProcessor(LogitsProcessor):
+    r"""
+    [`LogitsProcessor`] that enforces the specified token as the first generated token. Used with encoder-decoder
+    models.
+
+    Args:
+        bos_token_id (`int`):
+            The id of the token to force as the first generated token.
+
+    Examples:
+
+    ```python
+    >>> from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
+
+    >>> model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-small")
+    >>> tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-small")
+
+    >>> inputs = tokenizer("Translate from English to German: I love cats.", return_tensors="pt")
+
+    >>> # By default, it continues generating according to the model's logits
+    >>> outputs = model.generate(**inputs, max_new_tokens=10)
+    >>> print(tokenizer.batch_decode(outputs)[0])
+    <pad> Ich liebe Kitty.</s>
+
+    >>> # We can use `forced_bos_token_id` to force the start of generation with an encoder-decoder model
+    >>> # (including forcing it to end straight away with an EOS token)
+    >>> outputs = model.generate(**inputs, max_new_tokens=10, forced_bos_token_id=tokenizer.eos_token_id)
+    >>> print(tokenizer.batch_decode(outputs)[0])
+    <pad></s>
+    ```
+    """
+
+    def __init__(self, bos_token_id: int):
+        self.bos_token_id = bos_token_id
+
+    @add_start_docstrings(LOGITS_PROCESSOR_INPUTS_DOCSTRING)
+    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor:
+        cur_len = input_ids.shape[-1]
+        scores_processed = scores
+        if cur_len == 1:
+            scores_processed = torch.full_like(scores, -math.inf)
+            scores_processed[:, self.bos_token_id] = 0
+        return scores_processed
+
+
+class ForcedEOSTokenLogitsProcessor(LogitsProcessor):
+    r"""
+    [`LogitsProcessor`] that enforces the specified token as the last generated token when `max_length` is reached.
+
+    Args:
+        max_length (`int`):
+            The maximum length of the sequence to be generated.
+        eos_token_id (`Union[int, List[int], torch.Tensor]`):
+            The id(s) of the *end-of-sequence* token.
+        device (`str`, *optional*, defaults to `"cpu"`):
+            The device to allocate the tensors.
+
+    Examples:
+
+    ```python
+    >>> from transformers import AutoTokenizer, AutoModelForCausalLM
+
+    >>> model = AutoModelForCausalLM.from_pretrained("distilbert/distilgpt2")
+    >>> tokenizer = AutoTokenizer.from_pretrained("distilbert/distilgpt2")
+
+    >>> inputs = tokenizer("A sequence: 1, 2, 3", return_tensors="pt")
+
+    >>> # By default, it continues generating according to the model's logits
+    >>> outputs = model.generate(**inputs, max_new_tokens=10)
+    >>> print(tokenizer.batch_decode(outputs)[0])
+    A sequence: 1, 2, 3, 4, 5, 6, 7, 8
+
+    >>> # `forced_eos_token_id` ensures the generation ends with a EOS token
+    >>> outputs = model.generate(**inputs, max_new_tokens=10, forced_eos_token_id=tokenizer.eos_token_id)
+    >>> print(tokenizer.batch_decode(outputs)[0])
+    A sequence: 1, 2, 3, 4, 5, 6, 7,<|endoftext|>
+    ```
+    """
+
+    def __init__(self, max_length: int, eos_token_id: Union[int, List[int], torch.Tensor], device: str = "cpu"):
+        self.max_length = max_length
+
+        if not isinstance(eos_token_id, torch.Tensor):
+            if isinstance(eos_token_id, int):
+                eos_token_id = [eos_token_id]
+            eos_token_id = torch.tensor(eos_token_id, device=device)
+        self.eos_token_id = eos_token_id
+
+        if torch.is_floating_point(eos_token_id) or (eos_token_id < 0).any():
+            raise ValueError(f"`eos_token_id` has to be a list of positive integers, but is {eos_token_id}")
+
+    @add_start_docstrings(LOGITS_PROCESSOR_INPUTS_DOCSTRING)
+    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor:
+        cur_len = input_ids.shape[-1]
+        scores_processed = scores
+        if cur_len == self.max_length - 1:
+            scores_processed = torch.full_like(scores, -math.inf)
+            scores_processed[:, self.eos_token_id] = 0
+        return scores_processed
+
+
+class InfNanRemoveLogitsProcessor(LogitsProcessor):
+    r"""
+    [`LogitsProcessor`] that removes all `nan` and `inf` values to avoid the generation method to fail. Note that using
+    the logits processor should only be used if necessary since it can slow down the generation method.
+
+    This logits processor has no `generate` example, as there shouldn't be a correct combination of flags that warrants
+    its use.
+    """
+
+    @add_start_docstrings(LOGITS_PROCESSOR_INPUTS_DOCSTRING)
+    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor:
+        # set all nan values to 0.0
+        scores_processed = torch.where(scores != scores, 0.0, scores)
+
+        # set all +/-inf values to max/min possible value
+        scores_processed = torch.where(scores == float("inf"), torch.finfo(scores.dtype).max, scores_processed)
+        scores_processed = torch.where(scores == -float("inf"), torch.finfo(scores.dtype).min, scores_processed)
+
+        return scores_processed
+
+
+class ExponentialDecayLengthPenalty(LogitsProcessor):
+    r"""
+    [`LogitsProcessor`] that exponentially increases the score of the `eos_token_id` after `start_index` has been
+    reached. This allows generating shorter sequences without having a hard cutoff, allowing the `eos_token` to be
+    predicted in a meaningful position.
+
+    Args:
+        exponential_decay_length_penalty (`tuple(int, float)`):
+            This tuple shall consist of: `(start_index, decay_factor)` where `start_index` indicates where penalty
+            starts and `decay_factor` represents the factor of exponential decay
+        eos_token_id (`Union[int, List[int], torch.Tensor]`):
+            The id(s) of the *end-of-sequence* token.
+        input_ids_seq_length (`int`):
+            The length of the input sequence.
+
+    Examples:
+
+    ```python
+    >>> from transformers import AutoTokenizer, AutoModelForCausalLM, set_seed
+
+    >>> model = AutoModelForCausalLM.from_pretrained("openai-community/gpt2")
+    >>> tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2")
+
+    >>> text = "Just wanted to let you know, I"
+    >>> inputs = tokenizer(text, return_tensors="pt")
+
+    >>> # Let's consider that we want short sentences, so we limit `max_length=30`. However, we observe that the answer
+    >>> # tends to end abruptly.
+    >>> set_seed(1)
+    >>> outputs = model.generate(**inputs, do_sample=True, temperature=0.9, max_length=30, pad_token_id=50256)
+    >>> print(tokenizer.batch_decode(outputs)[0])
+    Just wanted to let you know, I received a link to an ebook, the book How To Start A Social Network which was
+    published in 2010. Although
+
+    >>> # To promote the appearance of the EOS token at the right time, we add the `exponential_decay_length_penalty =
+    >>> # (start_index, decay_factor)`. Instead of cutting at max_tokens, the output comes to an end before and usually
+    >>> # with more meaning. What happens is that starting from `start_index` the EOS token score will be increased
+    >>> # by `decay_factor` exponentially. However, if you set a high decay factor, you may also end up with abruptly
+    >>> # ending sequences.
+    >>> set_seed(1)
+    >>> outputs = model.generate(
+    ...     **inputs,
+    ...     do_sample=True,
+    ...     temperature=0.9,
+    ...     max_length=30,
+    ...     pad_token_id=50256,
+    ...     exponential_decay_length_penalty=(15, 1.6),
+    ... )
+    >>> print(tokenizer.batch_decode(outputs)[0])
+    Just wanted to let you know, I received a link to an ebook, the book How To Start A Social Network
+    which<|endoftext|>
+
+    >>> # With a small decay factor, you will have a higher chance of getting a meaningful sequence.
+    >>> set_seed(1)
+    >>> outputs = model.generate(
+    ...     **inputs,
+    ...     do_sample=True,
+    ...     temperature=0.9,
+    ...     max_length=30,
+    ...     pad_token_id=50256,
+    ...     exponential_decay_length_penalty=(15, 1.01),
+    ... )
+    >>> print(tokenizer.batch_decode(outputs)[0])
+    Just wanted to let you know, I received a link to an ebook, the book How To Start A Social Network which was
+    published in 2010.<|endoftext|>
+    ```
+    """
+
+    def __init__(
+        self,
+        exponential_decay_length_penalty: Tuple[int, float],
+        eos_token_id: Union[int, List[int], torch.Tensor],
+        input_ids_seq_length: int,
+    ):
+        self.regulation_start = exponential_decay_length_penalty[0] + input_ids_seq_length
+        self.regulation_factor = exponential_decay_length_penalty[1]
+
+        if not isinstance(eos_token_id, torch.Tensor):
+            if isinstance(eos_token_id, int):
+                eos_token_id = [eos_token_id]
+            eos_token_id = torch.tensor(eos_token_id)
+        self.eos_token_id = eos_token_id
+
+        if torch.is_floating_point(eos_token_id) or (eos_token_id < 0).any():
+            raise ValueError(f"`eos_token_id` has to be a list of positive integers, but is {eos_token_id}")
+
+    @add_start_docstrings(LOGITS_PROCESSOR_INPUTS_DOCSTRING)
+    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor:
+        cur_len = input_ids.shape[-1]
+        self.eos_token_id = self.eos_token_id.to(scores.device)
+        penalties = torch.zeros_like(scores)
+        scores_processed = scores
+        if cur_len > self.regulation_start:
+            penalty_idx = cur_len - self.regulation_start
+            # To support negative logits we compute the penalty of the absolute value and add to the original logit
+            penalty = torch.abs(scores[:, self.eos_token_id]) * (pow(self.regulation_factor, penalty_idx) - 1)
+            penalties[:, self.eos_token_id] = penalty
+            scores_processed = scores + penalties
+        return scores_processed
+
+
+class LogitNormalization(LogitsProcessor):
+    r"""
+    [`LogitsProcessor`] for normalizing the scores using log-softmax. It's important to normalize
+    the scores during beam search, after applying the logits processors or warpers, since the search algorithm used in
+    this library doesn't do it (it only does it before, but they may need re-normalization) but it still supposes that
+    the scores are normalized when comparing the hypotheses.
+
+    Examples:
+
+    ```python
+    >>> from transformers import AutoTokenizer, AutoModelForCausalLM
+    >>> import torch
+
+    >>> model = AutoModelForCausalLM.from_pretrained("distilbert/distilgpt2")
+    >>> tokenizer = AutoTokenizer.from_pretrained("distilbert/distilgpt2")
+
+    >>> inputs = tokenizer("A sequence: 1, 2, 3", return_tensors="pt")
+
+    >>> # By default, the scores are not normalized -- the sum of their exponentials is NOT a normalized probability
+    >>> # distribution, summing to 1
+    >>> outputs = model.generate(**inputs, return_dict_in_generate=True, output_scores=True)
+    >>> print(torch.allclose(torch.sum(torch.exp(outputs.scores[-1])), torch.Tensor((1.000,)), rtol=1e-4))
+    False
+
+    >>> # Normalizing them may have a positive impact on beam methods, or when using the scores on your application
+    >>> outputs = model.generate(**inputs, renormalize_logits=True, return_dict_in_generate=True, output_scores=True)
+    >>> print(torch.allclose(torch.sum(torch.exp(outputs.scores[-1])), torch.Tensor((1.000,)), rtol=1e-4))
+    True
+    ```
+    """
+
+    @add_start_docstrings(LOGITS_PROCESSOR_INPUTS_DOCSTRING)
+    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor:
+        scores_processed = scores.log_softmax(dim=-1)
+        return scores_processed
+
+
+class SuppressTokensAtBeginLogitsProcessor(LogitsProcessor):
+    r"""
+    [`SuppressTokensAtBeginLogitsProcessor`] supresses a list of tokens as soon as the `generate` function starts
+    generating using `begin_index` tokens. This should ensure that the tokens defined by `begin_suppress_tokens` are
+    not generated at the beginning. Originally created for
+    [Whisper](https://huggingface.co/docs/transformers/model_doc/whisper).
+
+    Examples:
+
+    ```python
+    >>> from transformers import AutoProcessor, WhisperForConditionalGeneration
+    >>> from datasets import load_dataset
+
+    >>> processor = AutoProcessor.from_pretrained("openai/whisper-tiny.en")
+    >>> model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny.en")
+    >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
+    >>> inputs = processor(ds[0]["audio"]["array"], return_tensors="pt")
+
+    >>> # Whisper has `begin_suppress_tokens` set by default (= `[220, 50256]`). 50256 is the EOS token, so this means
+    >>> # it can't generate and EOS token in the first iteration, but it can in the others.
+    >>> outputs = model.generate(**inputs, return_dict_in_generate=True, output_scores=True)
+    >>> print(outputs.scores[0][0, 50256])
+    tensor(-inf)
+    >>> print(outputs.scores[-1][0, 50256])  # in other places we can see some probability mass for EOS
+    tensor(29.9010)
+
+    >>> # If we disable `begin_suppress_tokens`, we can generate EOS in the first iteration.
+    >>> outputs = model.generate(
+    ...     **inputs, return_dict_in_generate=True, output_scores=True, begin_suppress_tokens=None
+    ... )
+    >>> print(outputs.scores[0][0, 50256])
+    tensor(11.2027)
+    ```
+    """
+
+    def __init__(self, begin_suppress_tokens, begin_index, device: str = "cpu"):
+        self.begin_suppress_tokens = torch.tensor(list(begin_suppress_tokens), device=device)
+        self.begin_index = begin_index
+
+    def set_begin_index(self, begin_index):
+        self.begin_index = begin_index
+
+    @add_start_docstrings(LOGITS_PROCESSOR_INPUTS_DOCSTRING)
+    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor:
+        vocab_tensor = torch.arange(scores.shape[-1], device=scores.device)
+        suppress_token_mask = isin_mps_friendly(vocab_tensor, self.begin_suppress_tokens)
+        scores_processed = scores
+        if input_ids.shape[-1] == self.begin_index:
+            scores_processed = torch.where(suppress_token_mask, -float("inf"), scores)
+
+        return scores_processed
+
+
+class SuppressTokensLogitsProcessor(LogitsProcessor):
+    r"""
+    This processor can be used to suppress a list of tokens. The processor will set their log probs to `-inf` so
+    that they are not generated. Originally created for
+    [Whisper](https://huggingface.co/docs/transformers/model_doc/whisper).
+
+    Examples:
+
+    ```python
+    >>> from transformers import AutoProcessor, WhisperForConditionalGeneration
+    >>> from datasets import load_dataset
+
+    >>> processor = AutoProcessor.from_pretrained("openai/whisper-tiny.en")
+    >>> model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny.en")
+    >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
+    >>> inputs = processor(ds[0]["audio"]["array"], return_tensors="pt")
+
+    >>> # Whisper has a long list of suppressed tokens. For instance, in this case, the token 1 is suppressed by default.
+    >>> outputs = model.generate(**inputs, return_dict_in_generate=True, output_scores=True)
+    >>> print(outputs.scores[1][0, 1])  # 1 (and not 0) is the first freely generated token
+    tensor(-inf)
+
+    >>> # If we disable `suppress_tokens`, we can generate it.
+    >>> outputs = model.generate(**inputs, return_dict_in_generate=True, output_scores=True, suppress_tokens=None)
+    >>> print(outputs.scores[1][0, 1])
+    tensor(6.0678)
+    ```
+    """
+
+    def __init__(self, suppress_tokens, device: str = "cpu"):
+        self.suppress_tokens = torch.tensor(list(suppress_tokens), device=device)
+
+    @add_start_docstrings(LOGITS_PROCESSOR_INPUTS_DOCSTRING)
+    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor:
+        vocab_tensor = torch.arange(scores.shape[-1], device=scores.device)
+        suppress_token_mask = isin_mps_friendly(vocab_tensor, self.suppress_tokens)
+        scores = torch.where(suppress_token_mask, -float("inf"), scores)
+        return scores
+
+
+class WhisperTimeStampLogitsProcessor(LogitsProcessor):
+    r"""
+
+    [`LogitsProcessor`] that modifies the logits for the generation of timestamps in the transcription. When the input
+    tokens are at a specific threshold, the processor sets the scores to negative infinity. The processor makes sure
+    that timestamp tokens appear in pairs, by masking out the logits that would break this pairing pattern. This is
+    done to maintain the consistency and structure of generated timestamps. It also ensures that when the predicted
+    probability of sampling any of the timestamp token is greater than any individual non-timestamp token, those
+    non-timestamp logits are set to negative infinity. This is done to ensure the generation of timestamps over other
+    potential tokens.
+
+
+    See [the paper](https://arxiv.org/abs/2212.04356) for more information.
+
+    Args:
+        generate_config (`GenerateConfig`):
+            The generate config used to generate the output. The following parameters are required:
+                eos_token_id (`int`, *optional*, defaults to 50257):
+                    The id of the *end-of-sequence* token.
+                no_timestamps_token_id (`int`, *optional*, defaults to 50363):
+                    The id of the `"<|notimestamps|>"` token.
+                max_initial_timestamp_index (`int`, *optional*, defaults to 1):
+                    Used to set the maximum value of the initial timestamp. This is used to prevent the model from
+                    predicting timestamps that are too far in the future.
+        begin_index (`Optional`, *optional*): Token index of the first token that is generated by the model.
+        _detect_timestamp_from_logprob (`bool`, *optional*): Whether timestamps can be predicted from logprobs over all timestamps.
+
+    Examples:
+    ``` python
+    >>> import torch
+    >>> from transformers import AutoProcessor, WhisperForConditionalGeneration, GenerationConfig
+    >>> from datasets import load_dataset
+
+    >>> processor = AutoProcessor.from_pretrained("openai/whisper-tiny.en")
+    >>> model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny.en")
+    >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
+    >>> inputs = processor(ds[3]["audio"]["array"], return_tensors="pt")
+    >>> input_features = inputs.input_features
+
+    >>> #Displaying timestamps
+    >>> generated_ids = model.generate(inputs=input_features, return_timestamps=True)
+    >>> transcription = processor.batch_decode(generated_ids, decode_with_timestamps=True)[0]
+    >>> print("Transcription:", transcription)
+    Transcription: <|startoftranscript|><|0.00|> He has grave doubts whether Sir Frederick Layton's work is really Greek after all, and can<|6.44|><|6.44|> discover in it but little of rocky Ithaca.<|9.44|><|endoftext|>
+
+
+    >>> #No timestamps & change EOS:
+    >>> #This allows the user to select a specific token to terminate the sequence on, in this case it's the word "can"(460)
+    >>> model.generation_config.eos_token_id = 460
+    >>> generated_ids = model.generate(inputs=input_features,return_timestamps=False)
+    >>> transcription = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
+    >>> print("Transcription:", transcription)
+    Transcription:  He has grave doubts whether Sir Frederick Layton's work is really Greek after all and can
+    ```
+    """
+
+    def __init__(
+        self,
+        generate_config,
+        begin_index: Optional[int] = None,
+        _detect_timestamp_from_logprob: Optional[bool] = None,
+    ):  # support for the kwargs
+        self.no_timestamps_token_id = generate_config.no_timestamps_token_id
+        self.timestamp_begin = generate_config.no_timestamps_token_id + 1
+        self.eos_token_id = generate_config.eos_token_id or generate_config.bos_token_id
+
+        # this variable is mostly just used for testing
+        self._detect_timestamp_from_logprob = (
+            _detect_timestamp_from_logprob
+            if _detect_timestamp_from_logprob is not None
+            else getattr(generate_config, "_detect_timestamp_from_logprob", True)
+        )
+
+        num_forced_ids = (
+            len(generate_config.forced_decoder_ids) if generate_config.forced_decoder_ids is not None else 0
+        )
+        self.begin_index = begin_index or (num_forced_ids + 1)
+
+        self.max_initial_timestamp_index = getattr(generate_config, "max_initial_timestamp_index", None)
+        # TODO(Patrick): Make sure that official models have max_initial_timestamp_index set to 50
+        # self.max_initial_timestamp_index = 50
+
+    def set_begin_index(self, begin_index):
+        self.begin_index = begin_index
+
+    @add_start_docstrings(LOGITS_PROCESSOR_INPUTS_DOCSTRING)
+    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor:
+        # suppress <|notimestamps|> which is handled by without_timestamps
+        scores_processed = scores.clone()
+        scores_processed[:, self.no_timestamps_token_id] = -float("inf")
+
+        # timestamps have to appear in pairs, except directly before eos_token; mask logits accordingly
+        for k in range(input_ids.shape[0]):
+            sampled_tokens = input_ids[k, self.begin_index :]
+            seq = list(sampled_tokens.tolist())
+
+            last_was_timestamp = len(seq) >= 1 and seq[-1] >= self.timestamp_begin
+            penultimate_was_timestamp = len(seq) < 2 or seq[-2] >= self.timestamp_begin
+
+            if last_was_timestamp:
+                if penultimate_was_timestamp:  # has to be non-timestamp
+                    scores_processed[k, self.timestamp_begin :] = -float("inf")
+                else:  # cannot be normal text tokens
+                    scores_processed[k, : self.eos_token_id] = -float("inf")
+
+            timestamps = sampled_tokens[sampled_tokens.ge(self.timestamp_begin)]
+            if timestamps.numel() > 0:
+                # `timestamps` shouldn't decrease; forbid timestamp tokens smaller than the last
+                # The following lines of code are copied from: https://github.com/openai/whisper/pull/914/files#r1137085090
+                if last_was_timestamp and not penultimate_was_timestamp:
+                    timestamp_last = timestamps[-1]
+                else:
+                    # Avoid to emit <|0.00|> again
+                    timestamp_last = timestamps[-1] + 1
+
+                scores_processed[k, self.timestamp_begin : timestamp_last] = -float("inf")
+
+        # apply the `max_initial_timestamp` option
+        if input_ids.shape[1] == self.begin_index:
+            scores_processed[:, : self.timestamp_begin] = -float("inf")
+
+            if self.max_initial_timestamp_index is not None:
+                last_allowed = self.timestamp_begin + self.max_initial_timestamp_index
+                scores_processed[:, last_allowed + 1 :] = -float("inf")
+
+        # if sum of probability over timestamps is above any other token, sample timestamp
+        logprobs = torch.nn.functional.log_softmax(scores_processed.float(), dim=-1)
+        for k in range(input_ids.shape[0]):
+            timestamp_logprob = logprobs[k, self.timestamp_begin :].logsumexp(dim=-1)
+            max_text_token_logprob = logprobs[k, : self.timestamp_begin].max()
+            if timestamp_logprob > max_text_token_logprob and self._detect_timestamp_from_logprob:
+                scores_processed[k, : self.timestamp_begin] = -float("inf")
+
+        return scores_processed
+
+
+class WhisperNoSpeechDetection(LogitsProcessor):
+    r"""This processor can be used to detect silence when using Whisper. It should take as input unprocessed logits to follow the original implementation"""
+
+    def __init__(self, no_speech_token: int, begin_index: int, scores_is_logprobs: bool = False):
+        self.no_speech_token = no_speech_token
+        # offset between <start-of-transcription> token, <SOT>, in paper and first generated token
+        # is equal to the position of the first generated token index
+        self.start_of_trans_offset = begin_index
+
+        # `self.begin_index` is a running value that is changed on the fly
+        self.begin_index = begin_index
+        self._no_speech_prob = [0.0]
+        self.is_scores_logprobs = scores_is_logprobs
+
+        # overwritten dynamically
+        self.model = None
+        self.inputs = None
+
+    def set_model(self, model):
+        self.model = model
+
+    def set_inputs(self, inputs):
+        self.inputs = {**self.model.prepare_inputs_for_generation(**inputs), **inputs}
+        self.inputs["input_features"] = self.inputs.pop("inputs")
+
+    @property
+    def no_speech_prob(self):
+        return self._no_speech_prob
+
+    def set_begin_index(self, begin_index):
+        self.begin_index = begin_index
+
+    @add_start_docstrings(LOGITS_PROCESSOR_INPUTS_DOCSTRING)
+    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor:
+        is_scores_logprobs = self.is_scores_logprobs
+
+        if input_ids.shape[1] == self.begin_index:
+            if self.start_of_trans_offset > 1:
+                with torch.no_grad():
+                    logits = self.model(**self.inputs).logits
+
+                no_speech_index = self.begin_index - self.start_of_trans_offset
+                no_speech_scores = logits[:, no_speech_index]
+                is_scores_logprobs = False
+            else:
+                no_speech_scores = scores
+
+            if is_scores_logprobs:
+                probs = no_speech_scores.exp()
+            else:
+                probs = no_speech_scores.float().softmax(dim=-1)
+
+            self._no_speech_prob = probs[:, self.no_speech_token]
+
+        return scores
+
+
+class ClassifierFreeGuidanceLogitsProcessor(LogitsProcessor):
+    r"""
+    [`LogitsProcessor`] for classifier free guidance (CFG). The scores are split over the batch dimension,
+    where the first half correspond to the conditional logits (predicted from the input prompt) and the second half
+    correspond to the unconditional logits (predicted from an empty or 'null' prompt). The processor computes a
+    weighted average across the conditional and unconditional logits, parameterised by the `guidance_scale`.
+
+    See [the paper](https://arxiv.org/abs/2306.05284) for more information.
+
+    <Tip warning={true}>
+
+    This logits processor is exclusively compatible with
+    [MusicGen](https://huggingface.co/docs/transformers/main/en/model_doc/musicgen)
+
+    </Tip>
+
+    Args:
+        guidance_scale (float):
+            The guidance scale for classifier free guidance (CFG). CFG is enabled by setting `guidance_scale > 1`.
+            Higher guidance scale encourages the model to generate samples that are more closely linked to the input
+            prompt, usually at the expense of poorer quality.
+
+    Examples:
+
+    ```python
+    >>> from transformers import AutoProcessor, MusicgenForConditionalGeneration
+
+    >>> processor = AutoProcessor.from_pretrained("facebook/musicgen-small")
+    >>> model = MusicgenForConditionalGeneration.from_pretrained("facebook/musicgen-small")
+
+    >>> inputs = processor(
+    ...     text=["80s pop track with bassy drums and synth", "90s rock song with loud guitars and heavy drums"],
+    ...     padding=True,
+    ...     return_tensors="pt",
+    ... )
+    >>> audio_values = model.generate(**inputs, do_sample=True, guidance_scale=3, max_new_tokens=256)
+    ```
+    """
+
+    def __init__(self, guidance_scale):
+        if guidance_scale > 1:
+            self.guidance_scale = guidance_scale
+        else:
+            raise ValueError(
+                "Require guidance scale >1 to use the classifier free guidance processor, got guidance scale "
+                f"{guidance_scale}."
+            )
+
+    @add_start_docstrings(LOGITS_PROCESSOR_INPUTS_DOCSTRING)
+    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor:
+        # simple check to make sure we have compatible batch sizes between our
+        # logits scores (cond + uncond) and input ids (cond only)
+        if scores.shape[0] != 2 * input_ids.shape[0]:
+            raise ValueError(
+                f"Logits should have twice the batch size of the input ids, the first half of batches corresponding to "
+                f"the conditional inputs, and the second half of batches corresponding to the unconditional inputs. Got "
+                f"batch size {scores.shape[0]} for the logits and {input_ids.shape[0]} for the input ids."
+            )
+        unguided_bsz = scores.shape[0] // 2
+        cond_logits, uncond_logits = scores.split(unguided_bsz, dim=0)
+        scores_processed = uncond_logits + (cond_logits - uncond_logits) * self.guidance_scale
+        return scores_processed
+
+
+class AlternatingCodebooksLogitsProcessor(LogitsProcessor):
+    r"""
+    [`LogitsProcessor`] enforcing alternated generation between the two codebooks of Bark.
+
+    <Tip warning={true}>
+
+    This logits processor is exclusively compatible with
+    [Bark](https://huggingface.co/docs/transformers/en/model_doc/bark)'s fine submodel. See the model documentation
+    for examples.
+
+    </Tip>
+
+    Args:
+        input_start_len (`int`):
+            The length of the initial input sequence.
+        semantic_vocab_size (`int`):
+            Vocabulary size of the semantic part, i.e number of tokens associated to the semantic vocabulary.
+        codebook_size (`int`):
+            Number of tokens associated to the codebook.
+    """
+
+    def __init__(self, input_start_len: int, semantic_vocab_size: int, codebook_size: int):
+        if not isinstance(input_start_len, int) or input_start_len < 0:
+            raise ValueError(f"`input_starting_length` has to be a non-negative integer, but is {input_start_len}")
+
+        self.input_start_len = input_start_len
+        self.semantic_vocab_size = semantic_vocab_size
+        self.codebook_size = codebook_size
+
+    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor:
+        curr_len = input_ids.shape[-1]
+
+        # even -> first codebook, odd -> second codebook
+        is_first_codebook = ((curr_len - self.input_start_len) % 2) == 0
+
+        scores_processed = scores.clone()
+        if is_first_codebook:
+            scores_processed[:, : self.semantic_vocab_size] = -float("inf")
+            scores_processed[:, self.semantic_vocab_size + self.codebook_size :] = -float("inf")
+        else:
+            scores_processed[:, : self.semantic_vocab_size + self.codebook_size] = -float("inf")
+
+        return scores_processed
+
+
+class UnbatchedClassifierFreeGuidanceLogitsProcessor(LogitsProcessor):
+    r"""
+    Logits processor for Classifier-Free Guidance (CFG). The processors computes a weighted average across scores
+    from prompt conditional and prompt unconditional (or negative) logits, parameterized by the `guidance_scale`.
+    The unconditional scores are computed internally by prompting `model` with the `unconditional_ids` branch.
+
+    See [the paper](https://arxiv.org/abs/2306.17806) for more information.
+
+    Args:
+        guidance_scale (`float`):
+            The guidance scale for classifier free guidance (CFG). CFG is enabled by setting `guidance_scale != 1`.
+            Higher guidance scale encourages the model to generate samples that are more closely linked to the input
+            prompt, usually at the expense of poorer quality. A value smaller than 1 has the opposite effect, while
+            making the negative prompt provided with negative_prompt_ids (if any) act as a positive prompt.
+        model (`PreTrainedModel`):
+            The model computing the unconditional scores. Supposedly the same as the one computing the conditional
+            scores. Both models must use the same tokenizer.
+        unconditional_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Indices of input sequence tokens in the vocabulary for the unconditional branch. If unset, will default to
+            the last token of the prompt.
+        unconditional_attention_mask (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Attention mask for unconditional_ids.
+        use_cache (`bool`, *optional*, defaults to `True`):
+            Whether to cache key/values during the negative prompt forward pass.
+
+
+    Examples:
+
+    ```python
+    >>> from transformers import AutoTokenizer, AutoModelForCausalLM
+
+    >>> model = AutoModelForCausalLM.from_pretrained("openai-community/gpt2")
+    >>> tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2")
+    >>> inputs = tokenizer(["Today, a dragon flew over Paris, France,"], return_tensors="pt")
+    >>> out = model.generate(inputs["input_ids"], guidance_scale=1.5)
+    >>> tokenizer.batch_decode(out, skip_special_tokens=True)[0]
+    'Today, a dragon flew over Paris, France, killing at least 50 people and injuring more than 100'
+
+    >>> # with a negative prompt
+    >>> neg_inputs = tokenizer(["A very happy event happened,"], return_tensors="pt")
+    >>> out = model.generate(inputs["input_ids"], guidance_scale=2, negative_prompt_ids=neg_inputs["input_ids"])
+    >>> tokenizer.batch_decode(out, skip_special_tokens=True)[0]
+    'Today, a dragon flew over Paris, France, killing at least 130 people. French media reported that'
+
+    >>> # with a positive prompt
+    >>> neg_inputs = tokenizer(["A very happy event happened,"], return_tensors="pt")
+    >>> out = model.generate(inputs["input_ids"], guidance_scale=0, negative_prompt_ids=neg_inputs["input_ids"])
+    >>> tokenizer.batch_decode(out, skip_special_tokens=True)[0]
+    "Today, a dragon flew over Paris, France, and I'm very happy to be here. I"
+    ```
+    """
+
+    def __init__(
+        self,
+        guidance_scale: float,
+        model,
+        unconditional_ids: Optional[torch.LongTensor] = None,
+        unconditional_attention_mask: Optional[torch.LongTensor] = None,
+        use_cache: Optional[bool] = True,
+    ):
+        self.guidance_scale = guidance_scale
+        self.model = model
+        self.unconditional_context = {
+            "input_ids": unconditional_ids,
+            "attention_mask": unconditional_attention_mask,
+            "use_cache": use_cache,
+            "past_key_values": None,
+            "first_pass": True,
+        }
+
+    def get_unconditional_logits(self, input_ids):
+        if self.unconditional_context["first_pass"]:
+            if self.unconditional_context["input_ids"] is None:
+                self.unconditional_context["input_ids"] = input_ids[:, -1:]
+            if self.unconditional_context["attention_mask"] is None:
+                self.unconditional_context["attention_mask"] = torch.ones_like(
+                    self.unconditional_context["input_ids"], dtype=torch.long
+                )
+            input_ids = self.unconditional_context["input_ids"]
+            attention_mask = self.unconditional_context["attention_mask"]
+            self.unconditional_context["first_pass"] = False
+        else:
+            attention_mask = torch.cat(
+                [
+                    self.unconditional_context["attention_mask"],
+                    torch.ones_like(input_ids[:, -1:], dtype=torch.long),
+                ],
+                dim=1,
+            )
+            if not self.unconditional_context["use_cache"]:
+                input_ids = torch.cat([self.unconditional_context["input_ids"], input_ids[:, -1:]], dim=1)
+            else:
+                input_ids = input_ids[:, -1:]
+            self.unconditional_context["input_ids"] = input_ids
+            self.unconditional_context["attention_mask"] = attention_mask
+
+        out = self.model(
+            input_ids,
+            attention_mask=attention_mask,
+            use_cache=self.unconditional_context["use_cache"],
+            past_key_values=self.unconditional_context["past_key_values"],
+        )
+        self.unconditional_context["past_key_values"] = out.get("past_key_values", None)
+
+        return out.logits
+
+    def __call__(self, input_ids, scores):
+        scores = torch.nn.functional.log_softmax(scores, dim=-1)
+        if self.guidance_scale == 1:
+            return scores
+
+        logits = self.get_unconditional_logits(input_ids)
+
+        unconditional_logits = torch.nn.functional.log_softmax(logits[:, -1], dim=-1)
+        scores_processed = self.guidance_scale * (scores - unconditional_logits) + unconditional_logits
+        return scores_processed
+
+
+class BarkEosPrioritizerLogitsProcessor(LogitsProcessor):
+    r"""This processor ensures that the EOS token is selected if its probability is greater than the `min_eos_p`.
+
+    <Tip warning={true}>
+
+    This logits processor is exclusively compatible with
+    [Bark](https://huggingface.co/docs/transformers/en/model_doc/bark). See the model documentation for examples.
+
+    </Tip>
+
+    Args:
+        eos_token_id (`Union[int, List[int], torch.Tensor]`):
+            The id(s) of the *end-of-sequence* token.
+        min_eos_p (`float`, *optional*):
+            Minimum end of speech threshold.
+    """
+
+    def __init__(self, eos_token_id: Union[int, List[int], torch.Tensor], min_eos_p: float, device: str = "cpu"):
+        if not isinstance(eos_token_id, torch.Tensor):
+            if isinstance(eos_token_id, int):
+                eos_token_id = [eos_token_id]
+            eos_token_id = torch.tensor(eos_token_id, device=device)
+        self.eos_token_id = eos_token_id
+
+        if torch.is_floating_point(eos_token_id) or (eos_token_id < 0).any():
+            raise ValueError(f"`eos_token_id` has to be a list of positive integers, but is {eos_token_id}")
+
+        if min_eos_p is not None and min_eos_p <= 0:
+            raise ValueError(f"`min_eos_p` has to be a positive float, but is {min_eos_p}")
+        self.min_eos_p = min_eos_p
+
+    @add_start_docstrings(LOGITS_PROCESSOR_INPUTS_DOCSTRING)
+    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor:
+        scores_processed = scores
+        if self.min_eos_p:
+            probs = torch.nn.functional.softmax(scores.float(), dim=-1)
+            # create scores full of -inf except for the eos_token_id
+            early_stop_scores = torch.ones_like(scores) * -float("inf")
+            early_stop_scores[:, self.eos_token_id] = scores[:, self.eos_token_id]
+
+            do_early_stop = probs[:, self.eos_token_id] > self.min_eos_p
+            do_early_stop = torch.any(do_early_stop, dim=1, keepdim=True)
+            scores_processed = torch.where(do_early_stop, early_stop_scores, scores)
+
+        return scores_processed
+
+
+class WatermarkLogitsProcessor(LogitsProcessor):
+    r"""
+    Logits processor for watermarking generated text. The processor modifies model output scores by adding a small bias to
+    randomized set of "green" tokens before generating the next token. "Green" tokens selection process depends on the
+    `seeding_scheme` used. The code was based on the [original repo](https://github.com/jwkirchenbauer/lm-watermarking/tree/main).
+
+    The text generated by this `LogitsProcessor` can be detected using `WatermarkDetector`. See [`~WatermarkDetector.__call__`] for details,
+
+    See [the paper](https://arxiv.org/abs/2306.04634) for more information.
+
+    Args:
+        vocab_size (`int`):
+            The model tokenizer's vocab_size. Used to calculate "green" tokens ratio.
+        device (`str`):
+            The device where model is allocated.
+        greenlist_ratio (`float`, optional, *optional*, defaults to 0.25):
+            The ratio of "green" tokens used to the vocabulary size. Defaults to 0.25.
+        bias (`float`, optional, *optional*, defaults to 2.0):
+            The bias added to the selected "green" tokens' logits. Consider lowering the
+            `bias` if the text generation quality degrades. Recommended values are in the
+            range of [0.5, 2.0]. Defaults to 2.0.
+        hashing_key (`int`, optional, *optional*, defaults to 15485863):
+            Key used for hashing. If you deploy this watermark, we advise using another private key.
+            Defaults to 15485863 (the millionth prime).
+        seeding_scheme (`str`, optional, *optional*, defaults to `"lefthash"`):
+            The seeding scheme used for selecting "green" tokens. Accepts values:
+                - "lefthash" (default): "green" tokens selection depend on the last token (Algorithm 2 from paper)
+                - "selfhash": "green" tokens selection depends on the current token itself (Algorithm 3 from paper)
+                    The downside of this scheme is that it considers all possible next tokens and can be slower than "lefthash".
+            The context length of previous tokens to use in seeding. Higher context length makes watermarking more robust.
+        context_width (`int`, *optional*, defaults to 1):
+            The number of previous tokens to use when setting the seed.
+
+    Examples:
+
+    ```python
+    >>> from transformers import AutoTokenizer, AutoModelForCausalLM, WatermarkingConfig
+
+    >>> model = AutoModelForCausalLM.from_pretrained("openai-community/gpt2")
+    >>> tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2")
+    >>> inputs = tokenizer(["Alice and Bob are"], return_tensors="pt")
+
+    >>> # normal generation
+    >>> out = model.generate(inputs["input_ids"], max_length=20, do_sample=False)
+    >>> tokenizer.batch_decode(out, skip_special_tokens=True)[0]
+    'Alice and Bob are both in the same room.\n\n"I\'m not sure if you\'re'
+
+    >>> # watermarked generation
+    >>> watermarking_config = WatermarkingConfig(bias=2.5, context_width=2, seeding_scheme="selfhash")
+    >>> out = model.generate(inputs["input_ids"], watermarking_config=watermarking_config, max_length=20, do_sample=False)
+    >>> tokenizer.batch_decode(out, skip_special_tokens=True)[0]
+    'Alice and Bob are both still alive and well and the story is pretty much a one-hour adventure'
+
+    >>> # to detect watermarked text use the WatermarkDetector class
+    >>> from transformers import WatermarkDetector
+    >>> detector = WatermarkDetector(model_config=model.config, device="cpu", watermarking_config= watermarking_config)
+    >>> detection_preds = detector(out)
+    >>> detection_preds
+    array([ True])
+    ```
+    """
+
+    def __init__(
+        self,
+        vocab_size,
+        device,
+        greenlist_ratio: float = 0.25,
+        bias: float = 2.0,
+        hashing_key: int = 15485863,
+        seeding_scheme: str = "lefthash",
+        context_width: int = 1,
+    ):
+        if seeding_scheme not in ["selfhash", "lefthash"]:
+            raise ValueError(f"seeding_scheme has to be one of [`selfhash`, `lefthash`], but found {seeding_scheme}")
+        if greenlist_ratio >= 1.0 or greenlist_ratio <= 0.0:
+            raise ValueError(
+                f"greenlist_ratio has be in range between 0.0 and 1.0, exclusively. but found {greenlist_ratio}"
+            )
+
+        self.vocab_size = vocab_size
+        self.greenlist_size = int(self.vocab_size * greenlist_ratio)
+        self.bias = bias
+        self.seeding_scheme = seeding_scheme
+        self.rng = torch.Generator(device=device)
+        self.hash_key = hashing_key
+        self.context_width = context_width
+
+        self.rng.manual_seed(hashing_key)
+        self.table_size = 1_000_003
+        self.fixed_table = torch.randperm(self.table_size, generator=self.rng, device=device)
+
+    def set_seed(self, input_seq: torch.LongTensor):
+        input_seq = input_seq[-self.context_width :]
+        if self.seeding_scheme == "selfhash":
+            a = self.fixed_table[input_seq % self.table_size] + 1
+            b = self.fixed_table[input_seq[-1] % self.table_size] + 1
+            seed = (self.hash_key * a * b).min().item()
+        else:
+            seed = self.hash_key * input_seq[-1].item()
+        self.rng.manual_seed(seed % (2**64 - 1))
+
+    def _get_greenlist_ids(self, input_seq: torch.LongTensor) -> torch.LongTensor:
+        self.set_seed(input_seq)
+        vocab_permutation = torch.randperm(self.vocab_size, device=input_seq.device, generator=self.rng)
+        greenlist_ids = vocab_permutation[: self.greenlist_size]
+        return greenlist_ids
+
+    def _score_rejection_sampling(self, input_seq: torch.LongTensor, scores: torch.FloatTensor) -> torch.LongTensor:
+        """
+        Generate greenlist based on current candidate next token. Reject and move on if necessary.
+        Runs for a fixed number of steps only for efficiency, since the methods is not batched.
+        """
+        final_greenlist = []
+        _, greedy_predictions = scores.sort(dim=-1, descending=True)
+
+        # 40 is an arbitrary number chosen to save compute and not run for long (taken from orig repo)
+        for i in range(40):
+            greenlist_ids = self._get_greenlist_ids(torch.cat([input_seq, greedy_predictions[i, None]], dim=-1))
+            if greedy_predictions[i] in greenlist_ids:
+                final_greenlist.append(greedy_predictions[i])
+        return torch.tensor(final_greenlist, device=input_seq.device)
+
+    @add_start_docstrings(LOGITS_PROCESSOR_INPUTS_DOCSTRING)
+    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor:
+        if input_ids.shape[-1] < self.context_width:
+            logger.warning(
+                f"`input_ids` should have at least `{self.context_width}` tokens but has {input_ids.shape[-1]}. "
+                "The seeding will be skipped for this generation step!"
+            )
+            return scores
+
+        scores_processed = scores.clone()
+        for b_idx, input_seq in enumerate(input_ids):
+            if self.seeding_scheme == "selfhash":
+                greenlist_ids = self._score_rejection_sampling(input_seq, scores[b_idx])
+            else:
+                greenlist_ids = self._get_greenlist_ids(input_seq)
+            scores_processed[b_idx, greenlist_ids] = scores_processed[b_idx, greenlist_ids] + self.bias
+
+        return scores_processed
+
+
+class SynthIDTextWatermarkState:
+    """SynthID watermarking state."""
+
+    def __init__(
+        self,
+        batch_size: int,
+        ngram_len: int,
+        context_history_size: int,
+        device: torch.device,
+    ):
+        """Initializes the state.
+
+        Args:
+            batch_size (`int`): Batch size.
+            ngram_len (`int`): Ngram length.
+            context_history_size (`int`): Size of the tensor to keep track of seen contexts.
+            device (`int`): Device to use.
+        """
+        self.context = torch.zeros(
+            (batch_size, ngram_len - 1),
+            dtype=torch.int64,
+            device=device,
+        )
+        self.context_history = torch.zeros(
+            (batch_size, context_history_size),
+            dtype=torch.int64,
+            device=device,
+        )
+        self.num_calls = 0
+
+
+class SynthIDTextWatermarkLogitsProcessor(LogitsProcessor):
+    r"""
+    Logits processor that implements watermarking techniques for text generation models.
+    This class facilitates the application of SynthID text watermarking, a method for embedding imperceptible signals
+    into generated text to aid in detecting synthetic content. It operates by subtly manipulating the probabilities of
+    token selection during text generation in a manner that can be reliably recovered later for verification.
+
+    Key Features:
+    * **State Management:** Maintains internal state to track token sequences and generate watermarking keys
+    dynamically.
+
+    * **Key Generation:** Computes hashes based on token sequences and watermarking parameters to create unique keys
+    for each position.
+
+    * **G-Value Sampling:** Employs a pre-computed sampling table to sample watermarking values (g-values) based on
+    the generated keys.
+
+    * **Score Adjustment:** Applies calculated g-values to modify token probabilities during generation, embedding the
+    watermark.
+
+    * **Context Repetition Handling:** Incorporates logic to avoid watermarking tokens in repeated contexts,
+    preserving naturalness.
+
+    * **EOS Token Masking:** Supports masking end-of-sentence tokens to prevent their inclusion in watermarking
+    calculations.
+
+    * **Utility Functions:** Provides functions to compute g-values directly, check for context repetition, create
+    EOS token masks, and estimate expected mean g-values.
+
+    Refer to paper url: https://www.nature.com/articles/s41586-024-08025-4 for more details around this.
+
+    Args:
+        ngram_len (`int`):
+            Ngram length.
+        keys (`List[int]`):
+            A sequence of watermarking keys, one for each depth.
+        sampling_table_size (`int`):
+            Size of the sampling table.
+        sampling_table_seed (`int`):
+            Random seed to generate the sampling table.
+        context_history_size (`int`):
+            Size of the tensor to keep track of seen contexts.
+        device (`torch.device`):
+            Device to use.
+        skip_first_ngram_calls (`bool`, *optional*, defaults to `False`):
+            Whether to skip first ngram calls.
+        debug_mode (`bool`, optional, *optional*, defaults to `False`):
+            Logits are modified to uniform one got before watermarking modification is applied. This is to test the
+            implementation.
+
+    Examples:
+    ```python
+    >>> from transformers import AutoModelForCausalLM, AutoTokenizer, SynthIDTextWatermarkingConfig
+
+    >>> tokenizer = AutoTokenizer.from_pretrained('google/gemma-2-2b', padding_side="left")
+    >>> model = AutoModelForCausalLM.from_pretrained('google/gemma-2-2b')
+
+    >>> # SynthID Text configuration
+    >>> watermarking_config = SynthIDTextWatermarkingConfig(
+    ...     keys=[654, 400, 836, 123, 340, 443, 597, 160, 57],
+    ...     ngram_len=5,
+    ... )
+
+    >>> # Generation with watermarking
+    >>> tokenized_prompts = tokenizer(["Once upon a time, "], return_tensors="pt", padding=True)
+    >>> output_sequences = model.generate(
+    ...     **tokenized_prompts, watermarking_config=watermarking_config, do_sample=True, max_new_tokens=10
+    ... )
+    >>> watermarked_text = tokenizer.batch_decode(output_sequences, skip_special_tokens=True)
+    ```
+    """
+
+    def __init__(
+        self,
+        ngram_len: int,
+        keys: List[int],
+        sampling_table_size: int,
+        sampling_table_seed: int,
+        context_history_size: int,
+        device: torch.device,
+        skip_first_ngram_calls: bool = False,
+        debug_mode: bool = False,
+    ):
+        self.ngram_len = ngram_len
+        self.keys = torch.tensor(keys, device=device)
+
+        generator = torch.Generator(device=device).manual_seed(sampling_table_seed)
+        # A random sampling table is pre-computed and modulo table size is applied to map from a hash of ngram keys to
+        # g values, this is similar to the hashtable implementation used in
+        # https://github.com/facebookresearch/three_bricks. We note that the hashing employed in this repository is
+        # different from that used to watermark the Gemini App, and hence the detectors trained based on the
+        # hashing in this repository will not transfer to text generated by the Gemini App.
+        self.sampling_table = torch.randint(
+            low=0,
+            high=2,
+            size=(sampling_table_size,),
+            generator=generator,
+            device=device,
+        )
+        self.context_history_size = context_history_size
+        self.device = device
+        self.state = None
+        self.skip_first_ngram_calls = skip_first_ngram_calls
+        self.debug_mode = debug_mode
+
+    def _init_state(self, batch_size: int):
+        """Initializes the state."""
+        self.state = SynthIDTextWatermarkState(
+            batch_size=batch_size,
+            ngram_len=self.ngram_len,
+            context_history_size=self.context_history_size,
+            device=self.device,
+        )
+
+    def update_scores(self, scores: torch.FloatTensor, g_values: torch.FloatTensor) -> torch.FloatTensor:
+        """Updates scores using the g values.
+
+        We assume that the scores are in the log space.
+        Args:
+            scores (`torch.FloatTensor`): Scores (batch_size, vocab_size).
+            g_values (`torch.FloatTensor`): G valus (batch_size, vocab_size, depth).
+
+        Returns:
+            Updated scores (batch_size, vocab_size).
+        """
+        _, _, depth = g_values.shape
+
+        probs = torch.softmax(scores, dim=1)
+
+        for i in range(depth):
+            g_values_at_depth = g_values[:, :, i]
+            g_mass_at_depth = (g_values_at_depth * probs).sum(axis=1, keepdims=True)
+            probs = probs * (1 + g_values_at_depth - g_mass_at_depth)
+
+        log_probs = torch.log(probs)
+        log_probs = torch.where(torch.isfinite(log_probs), log_probs, torch.finfo(log_probs.dtype).min)
+        return log_probs
+
+    @add_start_docstrings(LOGITS_PROCESSOR_INPUTS_DOCSTRING)
+    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor:
+        self._check_input_ids_shape(input_ids)
+        batch_size, vocab_size = scores.shape
+
+        if self.debug_mode:
+            scores = torch.ones_like(scores)
+
+        # Currently indices is just a arange to compute watermarking on the desnse logits.
+        all_indices = torch.stack([torch.arange(vocab_size, device=self.device) for _ in range(batch_size)])
+
+        if self.state is None:
+            # Initialize watermarking state if it does not exist.
+            self._init_state(batch_size)
+        else:
+            # Append last input id (which is the input id added in last call) to the
+            # previous context so we have the context to be used for current
+            # watermarking.
+            self.state.context = torch.concat(
+                (self.state.context, input_ids[:, -1:]),
+                dim=1,
+            )
+            self.state.context = self.state.context[:, 1:]
+
+        if self.state is None:
+            raise ValueError("self.state can't be None! Call `self._init_state` to initialize the state.")
+
+        self.state.num_calls += 1
+
+        # Don't watermark the first ngram_len - 1 tokens if set.
+        if self.skip_first_ngram_calls and self.state.num_calls < self.ngram_len:
+            return scores
+
+        # 2. Generate random keys for each ngram key combination.
+        ngram_keys, hash_result_with_just_context = self._compute_keys(self.state.context, all_indices)
+        # ngram_keys shape [batch_size, top_k, depth]
+
+        # 3. Sample g values.
+        g_values = self.sample_g_values(ngram_keys)
+        # g_values shape [batch_size, top_k, depth]
+
+        # 4. Modify scores.
+        updated_scores = self.update_scores(scores, g_values)
+        # updated scores shape [batch_size, top_k]
+
+        # 5. Check if the current watermarking context was previously used, if yes skip watermarking.
+        hash_result_with_just_context = hash_result_with_just_context[:, None]
+        is_repeated_context = (self.state.context_history == hash_result_with_just_context).any(
+            dim=1,
+            keepdim=True,
+        )
+        self.state.context_history = torch.concat(
+            (hash_result_with_just_context, self.state.context_history),
+            dim=1,
+        )[:, :-1]
+
+        updated_watermarked_scores = torch.where(
+            is_repeated_context,
+            input=scores,
+            other=updated_scores,
+        )
+        return updated_watermarked_scores
+
+    def accumulate_hash(
+        self,
+        current_hash: torch.LongTensor,
+        data: torch.LongTensor,
+        multiplier: int = 6364136223846793005,
+        increment: int = 1,
+    ) -> torch.LongTensor:
+        """
+        Accumulate hash of data on current hash.
+
+        Method uses adapted linear congruential generator with newlib/musl parameters.
+
+        This function has following property -
+        f(x, data[T]) = f(f(x, data[:T - 1]), data[T])
+
+        This function expects current_hash.shape and data.shape[:-1] to
+        match/broadcastable.
+
+        Args:
+            current_hash (`torch.LongTensor`):
+                (shape,)
+            data (`torch.LongTensor`):
+                (shape, tensor_len)
+            multiplier (`int`, optional, *optional*, defaults to 6364136223846793005):
+                multiplier of linear congruential generator
+            increment (`int`, optional, *optional*, defaults to 1):
+                increment of linear congruential generator
+
+        Returns:
+            updated hash (shape,)
+        """
+        for i in range(data.shape[-1]):
+            current_hash = torch.add(current_hash, data[..., i])
+            current_hash = torch.mul(current_hash, multiplier)
+            current_hash = torch.add(current_hash, increment)
+        return current_hash
+
+    def compute_ngram_keys(self, ngrams: torch.LongTensor) -> torch.LongTensor:
+        """Computes random keys for each ngram and depth.
+
+        Args:
+            ngrams (`torch.LongTensor`):
+                Ngrams (batch_size, num_ngrams, ngram_len).
+
+        Returns:
+            ngram keys (batch_size, num_ngrams, depth).
+        """
+        if len(ngrams.shape) != 3:
+            raise ValueError(
+                "Ngrams should be of shape (batch_size, num_ngrams, ngram_len), but" f" is {ngrams.shape}"
+            )
+        if ngrams.shape[2] != self.ngram_len:
+            raise ValueError(
+                "Ngrams should be of shape (batch_size, num_ngrams, ngram_len),"
+                f" where ngram_len is {self.ngram_len}, but is {ngrams.shape}"
+            )
+        batch_size, _, _ = ngrams.shape
+
+        hash_result = torch.ones(batch_size, device=self.device, dtype=torch.long)
+        # hash_result shape [batch_size,]
+        # ngrams shape [batch_size, num_ngrams, ngram_len]
+        hash_result = torch.vmap(self.accumulate_hash, in_dims=(None, 1), out_dims=1)(hash_result, ngrams)
+        # hash_result shape [batch_size, num_ngrams]
+
+        keys = self.keys[None, None, :, None]
+        # hash_result shape [batch_size, num_ngrams]
+        # keys shape [1, 1, depth, 1]
+        hash_result = torch.vmap(self.accumulate_hash, in_dims=(None, 2), out_dims=2)(hash_result, keys)
+        # hash_result shape [batch_size, num_ngrams, depth]
+
+        return hash_result
+
+    def _compute_keys(
+        self, n_minus_1_grams: torch.LongTensor, indices: torch.LongTensor
+    ) -> Tuple[torch.LongTensor, torch.LongTensor]:
+        """Computes random keys for each ngram and depth.
+
+        Args:
+            n_minus_1_grams (`torch.LongTensor`):
+                Ngrams (batch_size, ngram_len - 1).
+            indices (`torch.LongTensor`):
+                indices of the continuations (batch_size, num_indices)
+
+        Returns:
+            Ngram keys (batch_size, num_indices, depth).
+        """
+        batch_size, _ = n_minus_1_grams.shape
+
+        hash_result = torch.ones(batch_size, device=self.device, dtype=torch.long)
+        # First hash n_minus_1 gram, for each batch entry we have a single
+        # n_minus_1 gram context.
+        # hash_result shape [batch_size]
+        # n_minus_1_gram shape [batch_size, ngram_len - 1]
+        hash_result_with_just_context = self.accumulate_hash(hash_result, n_minus_1_grams)
+        # hash_result shape [batch_size,]
+        # Indices is of shape [batch_size, num_indices], so we make it
+        # [batch_size, num_indices, 1] so we can vmap over num_indices dim.
+        hash_result = torch.vmap(self.accumulate_hash, in_dims=(None, 1), out_dims=1)(
+            hash_result_with_just_context, indices[:, :, None]
+        )
+        # hash_result shape [batch_size, num_indices]
+        # Basically we have a hash for each batch entry and each indices
+        # Now we add watermarking keys to this hash.
+        # keys are of shape [depth,]
+        # We add batch, num_indices and data dimension to this making it
+        # [1, 1, depth, 1].
+        # So we can vmap over the depth dimension for compute_hash
+        keys = self.keys[None, None, :, None]
+        hash_result = torch.vmap(self.accumulate_hash, in_dims=(None, 2), out_dims=2)(hash_result, keys)
+        # hash_result shape should be [batch_size, num_indices, depth]
+        return hash_result, hash_result_with_just_context
+
+    def sample_g_values(self, ngram_keys: torch.LongTensor) -> torch.LongTensor:
+        """
+        Samples g values from Bernoulli distribution.
+
+        It is not possible to pass random keys in a vectorized way in torch. Instead
+        we pre-compute a random sampling table, and use apply modulo table size to
+        map from ngram keys (int64) to g values.
+
+        Args:
+            ngram_keys (`torch.LongTensor`):
+                Random keys (batch_size, num_ngrams, depth).
+
+        Returns:
+            G values (batch_size, num_ngrams, depth).
+        """
+        (sampling_table_size,) = self.sampling_table.shape
+        sampling_table = self.sampling_table.reshape((1, 1, sampling_table_size))
+        ngram_keys = ngram_keys % sampling_table_size
+        return torch.take_along_dim(sampling_table, indices=ngram_keys, dim=2)
+
+    def _check_input_ids_shape(self, input_ids: torch.LongTensor):
+        """Checks the shape of input ids."""
+        if len(input_ids.shape) != 2:
+            raise ValueError("Input ids should be of shape (batch_size, input_len), but is" f" {input_ids.shape}")
+
+    def compute_g_values(self, input_ids: torch.LongTensor) -> torch.LongTensor:
+        """
+        Computes g values for each ngram from the given sequence of tokens.
+
+        Args:
+            input_ids (`torch.LongTensor`):
+                Input token ids (batch_size, input_len).
+
+        Returns:
+            G values (batch_size, input_len - (ngram_len - 1), depth).
+        """
+        self._check_input_ids_shape(input_ids)
+        ngrams = input_ids.unfold(dimension=1, size=self.ngram_len, step=1)
+        ngram_keys = self.compute_ngram_keys(ngrams)
+        return self.sample_g_values(ngram_keys)
+
+    def compute_context_repetition_mask(self, input_ids: torch.LongTensor) -> torch.LongTensor:
+        """
+        Computes repetition mask.
+
+        0 and 1 stand for repeated and not repeated context n-1 grams respectively.
+
+        Args:
+            input_ids (`torch.LongTensor`):
+                Input token ids (batch_size, input_len).
+
+        Returns:
+            Repetitions mask (batch_size, input_len - (ngram_len - 1)).
+        """
+        self._check_input_ids_shape(input_ids)
+        batch_size, _ = input_ids.shape
+        state = SynthIDTextWatermarkState(
+            batch_size=batch_size,
+            ngram_len=self.ngram_len,
+            context_history_size=self.context_history_size,
+            device=self.device,
+        )
+        contexts = input_ids[:, :-1].unfold(
+            dimension=1,
+            size=self.ngram_len - 1,
+            step=1,
+        )
+        _, num_contexts, _ = contexts.shape
+
+        are_repeated_contexts = []
+        for i in range(num_contexts):
+            context = contexts[:, i, :]
+            hash_result = torch.ones(batch_size, device=self.device, dtype=torch.long)
+            context_hash = self.accumulate_hash(hash_result, context)[:, None]
+            is_repeated_context = (state.context_history == context_hash).any(
+                dim=1,
+                keepdim=True,
+            )
+            are_repeated_contexts.append(is_repeated_context)
+            state.context_history = torch.concat(
+                (context_hash, state.context_history),
+                dim=1,
+            )[:, :-1]
+        are_repeated_contexts = torch.concat(are_repeated_contexts, dim=1)
+
+        return torch.logical_not(are_repeated_contexts)
+
+    def compute_eos_token_mask(self, input_ids: torch.LongTensor, eos_token_id: int) -> torch.LongTensor:
+        """
+        Computes repetitions mask.
+
+        1 stands for ngrams that don't contain EOS tokens and vice versa.
+
+        Args:
+            input_ids (`torch.LongTensor`):
+                Input token ids (batch_size, input_len).
+            eos_token_id (`int`):
+                EOS token ID.
+
+        Returns:
+            EOS token mask (batch_size, input_len).
+        """
+        self._check_input_ids_shape(input_ids)
+        noneos_masks = []
+        all_eos_equated = input_ids == eos_token_id
+        for eos_equated in all_eos_equated:
+            nonzero_idx = torch.nonzero(eos_equated)
+            noneos_mask = torch.ones_like(eos_equated)
+            if nonzero_idx.shape[0] != 0:
+                noneos_mask[nonzero_idx[0][0] :] = 0
+            noneos_masks.append(noneos_mask)
+        return torch.stack(noneos_masks, dim=0)
+
+    def expected_mean_g_value(self, vocab_size: int, coinflip_prob: float = 0.5) -> float:
+        """
+        Compute expected mean g-value after watermarking, assuming uniform LM dist.
+
+        This is the theoretical expected value for single-layer watermarking.
+
+        Args:
+            vocab_size (`int`):
+                The size of the vocabulary.
+            coinflip_prob arg_name (`float`, *optional*, defaults to 0.5):
+                Probability of 1 in boolean prf.
+
+        Returns:
+            The expected mean g-value for watermarked text.
+        """
+        return coinflip_prob + coinflip_prob * (1 - coinflip_prob) * (1 - (1 / vocab_size))
diff --git a/.venv/lib/python3.11/site-packages/transformers/generation/stopping_criteria.py b/.venv/lib/python3.11/site-packages/transformers/generation/stopping_criteria.py
new file mode 100644
index 0000000000000000000000000000000000000000..b950a69f8b64929a2c9fee87f28d44585ebb93ee
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/transformers/generation/stopping_criteria.py
@@ -0,0 +1,514 @@
+import time
+import warnings
+from abc import ABC
+from collections import OrderedDict
+from copy import deepcopy
+from typing import Dict, List, Optional, Tuple, Union
+
+import numpy as np
+import torch
+from torch.nn import functional as F
+
+from ..pytorch_utils import isin_mps_friendly
+from ..tokenization_utils_base import PreTrainedTokenizerBase
+from ..utils import add_start_docstrings, logging
+
+
+logger = logging.get_logger(__name__)
+# We maintain a module-level cache of the embedding vectors for the stop string criterion
+# because they are slow to compute
+STOP_STRING_EMBEDDING_CACHE = OrderedDict()
+
+
+STOPPING_CRITERIA_INPUTS_DOCSTRING = r"""
+    Args:
+        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
+            Indices of input sequence tokens in the vocabulary.
+
+            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+            [`PreTrainedTokenizer.__call__`] for details.
+
+            [What are input IDs?](../glossary#input-ids)
+        scores (`torch.FloatTensor` of shape `(batch_size, config.vocab_size)`):
+            Prediction scores of a language modeling head. These can be scores for each vocabulary token before SoftMax
+            or scores for each vocabulary token after SoftMax. If this stopping criteria depends on the `scores` input,
+            make sure you pass `return_dict_in_generate=True, output_scores=True` to `generate`.
+        kwargs (`Dict[str, Any]`, *optional*):
+            Additional stopping criteria specific kwargs.
+
+    Return:
+        `torch.BoolTensor`. (`torch.BoolTensor` of shape `(batch_size, 1)`), where `True` indicates we stop generation
+            for a particular row, `True` indicates we should continue.
+
+"""
+
+
+class StoppingCriteria(ABC):
+    """Abstract base class for all stopping criteria that can be applied during generation.
+
+    If your stopping criteria depends on the `scores` input, make sure you pass `return_dict_in_generate=True,
+    output_scores=True` to `generate`.
+    """
+
+    @add_start_docstrings(STOPPING_CRITERIA_INPUTS_DOCSTRING)
+    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> torch.BoolTensor:
+        raise NotImplementedError("StoppingCriteria needs to be subclassed")
+
+
+class MaxLengthCriteria(StoppingCriteria):
+    """
+    This class can be used to stop generation whenever the full generated number of tokens exceeds `max_length`. Keep
+    in mind for decoder-only type of transformers, this will include the initial prompted tokens.
+
+    Args:
+        max_length (`int`):
+            The maximum length that the output sequence can have in number of tokens.
+        max_position_embeddings (`int`, *optional*):
+            The maximum model length, as defined by the model's `config.max_position_embeddings` attribute.
+    """
+
+    def __init__(self, max_length: int, max_position_embeddings: Optional[int] = None):
+        self.max_length = max_length
+        self.max_position_embeddings = max_position_embeddings
+
+    @add_start_docstrings(STOPPING_CRITERIA_INPUTS_DOCSTRING)
+    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> torch.BoolTensor:
+        cur_len = input_ids.shape[-1]
+        is_done = cur_len >= self.max_length
+        if self.max_position_embeddings is not None and not is_done and cur_len >= self.max_position_embeddings:
+            logger.warning_once(
+                "This is a friendly reminder - the current text generation call will exceed the model's predefined "
+                f"maximum length ({self.max_position_embeddings}). Depending on the model, you may observe "
+                "exceptions, performance degradation, or nothing at all."
+            )
+        return torch.full((input_ids.shape[0],), is_done, device=input_ids.device, dtype=torch.bool)
+
+
+class MaxTimeCriteria(StoppingCriteria):
+    """
+    This class can be used to stop generation whenever the full generation exceeds some amount of time. By default, the
+    time will start being counted when you initialize this function. You can override this by passing an
+    `initial_time`.
+
+    Args:
+        max_time (`float`):
+            The maximum allowed time in seconds for the generation.
+        initial_time (`float`, *optional*, defaults to `time.time()`):
+            The start of the generation allowed time.
+    """
+
+    def __init__(self, max_time: float, initial_timestamp: Optional[float] = None):
+        self.max_time = max_time
+        self.initial_timestamp = time.time() if initial_timestamp is None else initial_timestamp
+
+    @add_start_docstrings(STOPPING_CRITERIA_INPUTS_DOCSTRING)
+    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> torch.BoolTensor:
+        is_done = time.time() - self.initial_timestamp > self.max_time
+        return torch.full((input_ids.shape[0],), is_done, device=input_ids.device, dtype=torch.bool)
+
+
+class StopStringCriteria(StoppingCriteria):
+    """
+    This class can be used to stop generation whenever specific string sequences are generated. It preprocesses
+    the strings together with the tokenizer vocab to find positions where tokens can validly complete the stop strings.
+
+    Generation is stopped as soon as a token is generated that completes any of the stop strings.
+    We want to catch any instance in which the stop string would be present in the decoded output, which means
+    we must also catch cases with "overhangs" off one or both ends. To make this more concrete, for the stop string
+    "stop", any of the following token sequences would trigger the match:
+
+    - ["st", "op"]
+    - ["stop"]
+    - ["st", "opera"]
+    - ["sto", "pper"]
+    - ["las", "topper"]
+    - ["s", "to", "pped"]
+
+    Note that a match will only be triggered if the stop string is at the end of the generated sequence. In other
+    words, these sequences will not trigger a match:
+
+    - ["stop", "at"]
+    - ["st", "op", "at"]
+    - ["st", "opera", "tion"]
+
+    The reason these are not a match is that the stop string does not overlap with the final token. If you can remove
+    one or more tokens from the end of the sequence without destroying the stop string, then this criterion will not
+    match that stop string. This is by design; because this check is run after each token is generated, we can't miss a
+    valid stop string if one is generated, but we don't want to halt generation just because the stop string exists
+    somewhere in the past input_ids.
+
+    How is the match actually performed, though? We do it in quite a confusing way, because we want the entire match
+    process to be compilable with Torch or XLA, which means we cannot use standard string methods. However, it is possible,
+    with some work, to do string matching with pure tensor operations. We'll begin by describing the algorithm we use
+    with standard string operations, and then at the end we'll explain how this is converted to pure tensor operations.
+
+    The key to the algorithm is an observation: Because the stop string must overlap with the end of the token sequence, we can start at
+    the end of the sequence and work backwards. Specifically, we check that there is an overlap between the start of
+    the final token and the end of the stop_string, or to put it another way, stop_string[-i:] == token[:i] for
+    some i > 0. If you look at the positive examples above, you'll see the last token in all of them fulfills this
+    property:
+
+    - ["st", "op"] (overlap is "op", overlap length == 2)
+    - ["stop"]  (overlap is "stop", overlap length == 4)
+    - ["st", "opera"]  (overlap is "op", overlap length == 2)
+    - ["sto", "pper"]  (overlap is "p", overlap length == 1)
+    - ["las", "topper"]  (overlap is "top", overlap length == 3)
+    - ["s", "to", "pped"]  (overlap is "p", overlap length == 1)
+
+    It's impossible to construct a matching sequence that does not have this property (feel free to verify this
+    yourself). However, although this overlap between the start of the final token and the end of the stop string is
+    necessary for a match, it is not sufficient. We also need to check that the rest of the token sequence is
+    consistent with the stop string.
+
+    How do we do that? Let's use ["s", "to", "pped"] as an example. We know that the final token, "pped", has an
+    overlap of 1 with the stop string, "stop". We then go back to the previous token, "to". Since we have already
+    matched 1 character from the stop string, the remainder to check is "sto". We check that the next token "to"
+    matches the end of the remainder, which it does. We have now matched 3 characters from the stop string, and the
+    remainder to match is "s". We go back to the previous token again, which is also "s". This is a match, and so
+    we have matched the entire stop string.
+
+    How does it work when the tokens run off the start of the stop string, though? Let's consider the example of
+    ["las", "topper"]. The final token, "topper", has an overlap of 3 with the stop string, "stop". Therefore,
+    the remaining stop string to match is "s". We go back to the previous token, "las". Because the remainder to
+    match is just "s", with length 1, we consider only the final 1 character from the token, which is "s". This
+    matches the stop string, and so the entire string is matched.
+
+    How do we compute these matches with tensor operations, though? Simply: we efficiently precompute the necessary
+    information for all tokens! For every token, we compute:
+    - Its overlap with the end of the stop string, if any
+    - The positions inside the stop string where the token matches, including matches that run off the start.
+    - The total length of the token
+
+    For example, for the token "pped", we would compute an end overlap of 1, no internal matching positions,
+    and a length of 4. For the token "to", we would compute no end overlap, a single internal matching position
+    of 1 (counting from the end), and a length of 2. For the token "s", we would compute no end overlap,
+    a single internal matching position of 3 (again counting from the end) and a length of 1.
+
+    As long as we have this information, we can execute the algorithm above without any string comparison
+    operations. We simply perform the following steps:
+    - Check if the final token has an end-overlap with the start string
+    - Continue backwards, keeping track of how much of the stop string we've matched so far
+    - At each point, check if the next token has the current position as one of its valid positions
+    - Continue until either a match fails, or we completely match the whole stop string
+
+    Again, consider ["s", "to", "pped"] as an example. "pped" has an end overlap of 1, so we can begin a match.
+    We have matched 1 character so far, so we check that the next token "to", has 1 as a valid position (again,
+    counting from the end). It does, so we add the length of "to" to our position tracker. We have now matched
+    3 characters, so we check that the next token "s" has 3 as a valid position. It does, so we add its length
+    to the position tracker. The position tracker is now 4, which is the length of the stop string. We have matched the
+    entire stop string.
+
+    In the second case, ["las", "topper"], "topper" has an end overlap of 3, so we can begin a match. We have
+    matched 3 characters so far, so we check that the next token "las" has 3 as a valid position. It does, because we
+    allow tokens to match positions that run off the start of the stop string. We add its length to the position
+    tracker. The position tracker is now 6, which is greater than the length of the stop string! Don't panic, though -
+    this also counts as a match of the stop string. We have matched the entire stop string.
+
+
+    Args:
+        tokenizer (`PreTrainedTokenizer`):
+            The model's associated tokenizer (necessary to extract vocab and tokenize the termination sequences)
+        stop_strings (`Union[str, List[str]]`):
+            A list of strings that should end generation. If a string is passed, it will be treated like a
+            list with a single element.
+
+    Examples:
+
+    ```python
+    >>> from transformers import AutoModelForCausalLM, AutoTokenizer
+
+    >>> tokenizer = AutoTokenizer.from_pretrained("microsoft/phi-2")
+    >>> model = AutoModelForCausalLM.from_pretrained("microsoft/phi-2")
+    >>> inputs = tokenizer("The biggest states in the USA by land area:", return_tensors="pt")
+
+    >>> gen_out = model.generate(**inputs)
+    >>> print(tokenizer.batch_decode(gen_out, skip_special_tokens=True)[0])
+    The biggest states in the USA by land area:
+    - Alaska
+    - Texas
+    - California
+
+    >>> # Passing one or more stop strings will halt generation after those strings are emitted
+    >>> # Note that generating with stop strings requires you to pass the tokenizer too
+    >>> gen_out = model.generate(**inputs, stop_strings=["Texas"], tokenizer=tokenizer)
+    >>> print(tokenizer.batch_decode(gen_out, skip_special_tokens=True)[0])
+    The biggest states in the USA by land area:
+    - Alaska
+    - Texas
+    ```
+    """
+
+    def __init__(self, tokenizer: PreTrainedTokenizerBase, stop_strings: Union[str, List[str]]):
+        if isinstance(stop_strings, str):
+            stop_strings = [stop_strings]
+        self.stop_strings: Tuple[str, ...] = tuple(stop_strings)
+        vocab = tokenizer.get_vocab()
+        token_list, token_indices = tuple(vocab.keys()), tuple(vocab.values())
+        self.embedding_vec, self.max_valid_positions, self.max_valid_end_lens = self.clean_and_embed_tokens_with_cache(
+            token_list, token_indices, self.stop_strings, tokenizer
+        )
+
+        self.maximum_token_len = max([len(stop_string) for stop_string in self.stop_strings])
+        self.num_stop_strings = len(self.stop_strings)
+        self.target_lens = torch.tensor([len(stop_string) for stop_string in stop_strings], dtype=torch.int32)
+
+    def clean_and_embed_tokens_with_cache(self, token_list, token_indices, stop_strings, tokenizer):
+        # We don't use the tokenizer in the cache key, because I don't trust it to have well-behaved equality
+        if (token_list, token_indices, stop_strings) in STOP_STRING_EMBEDDING_CACHE:
+            embedding_vec, max_valid_positions, max_valid_end_lens = STOP_STRING_EMBEDDING_CACHE[
+                (token_list, token_indices, self.stop_strings)
+            ]
+            STOP_STRING_EMBEDDING_CACHE.move_to_end((token_list, token_indices, stop_strings))
+        else:
+            clean_token_list, clean_token_indices = self.clean_tokenizer_vocab(tokenizer)
+            embedding_vec, max_valid_positions, max_valid_end_lens = self._stop_string_create_embedding_vec(
+                clean_token_list, clean_token_indices, stop_strings
+            )
+            STOP_STRING_EMBEDDING_CACHE[(token_list, token_indices, stop_strings)] = (
+                embedding_vec,
+                max_valid_positions,
+                max_valid_end_lens,
+            )
+            if len(STOP_STRING_EMBEDDING_CACHE) > 8:
+                STOP_STRING_EMBEDDING_CACHE.popitem(last=False)  # Pop from the start, the least recently used item
+        return embedding_vec, max_valid_positions, max_valid_end_lens
+
+    @staticmethod
+    def clean_tokenizer_vocab(tokenizer, static_prefix="abcdef"):
+        """
+        This method turns a tokenizer vocab into a "clean" vocab where each token represents the actual string
+        it will yield, without any special prefixes like "##" or "Ġ". This is trickier than it looks - the method
+        tokenizer.convert_tokens_to_string() does not always return the correct string because of issues with prefix
+        space addition/removal. To work around this, we add a static prefix to the start of the token, then remove
+        it (and any prefix that may have been introduced with it) after calling convert_tokens_to_string().
+        """
+        vocab = tokenizer.get_vocab()
+        clean_token_list = []
+        clean_token_indices = []
+        sentence_base = tokenizer(static_prefix, add_special_tokens=False)["input_ids"]
+        tokens_base = [tokenizer._convert_id_to_token(tok) for tok in sentence_base]
+        for token, token_idx in vocab.items():
+            token_string = tokenizer.convert_tokens_to_string(tokens_base + [token])
+            token_string = token_string[token_string.index(static_prefix) + len(static_prefix) :]
+            clean_token_list.append(token_string)
+            clean_token_indices.append(token_idx)
+        return tuple(clean_token_list), tuple(clean_token_indices)
+
+    @staticmethod
+    def _stop_string_get_matching_positions(
+        token_list, token_indices, stop_strings
+    ) -> Tuple[Dict[str, Dict[str, List[int]]], Dict[str, Dict[str, List[int]]]]:
+        """This function preprocesses stop strings and the tokenizer vocabulary to determine where tokens can
+        validly appear in the stop strings. For each token, it computes a list of positions in the stop string where the
+        token appears, as well as a list of the possible "end overlaps" for that token - that is, the number of characters
+        from the end of the stop string that overlap with the start of the token, which can have more than one value.
+
+        The reason for computing these may seem a bit cryptic - please see the docstring for StopStringCriteria for a full
+        explanation of what these values are for!"""
+
+        token_valid_positions = {}
+        token_end_overlaps = {}
+        for stop_string in stop_strings:
+            reversed_stop_string = stop_string[::-1]
+            token_valid_positions[stop_string] = {}
+            token_end_overlaps[stop_string] = {}
+            for token, tok_idx in zip(token_list, token_indices):
+                reversed_token = token[::-1]
+                matching_positions = []
+                possible_end_lengths = []
+                for i in range(1 - len(token), len(stop_string)):
+                    if i < 0:
+                        tok = reversed_token[-i:]
+                        i = 0
+                    else:
+                        tok = reversed_token
+                    stop = reversed_stop_string[i : i + len(tok)]
+                    if tok.startswith(stop):
+                        if i == 0:
+                            possible_end_lengths.append(min(len(tok), len(stop)))
+                        else:
+                            matching_positions.append(i)
+
+                if matching_positions:
+                    token_valid_positions[stop_string][tok_idx] = matching_positions
+                if possible_end_lengths:
+                    token_end_overlaps[stop_string][tok_idx] = possible_end_lengths
+        return token_valid_positions, token_end_overlaps
+
+    @staticmethod
+    def _stop_string_create_embedding_vec(token_list, token_indices, stop_strings) -> Dict[str, torch.tensor]:
+        """This function precomputes everything needed for the run-time checks in StopStringCriteria, and packs
+        them into an embedding tensor that can be accessed with pure tensor operations. For the specifics of the values
+        that are precomputed and what they are used for, please refer to the StopStringCriteria docstring!"""
+        token_valid_positions, token_end_overlaps = StopStringCriteria._stop_string_get_matching_positions(
+            token_list, token_indices, stop_strings
+        )
+        all_valid_positions = [len(val) for positions in token_valid_positions.values() for val in positions.values()]
+        # In some cases, tokens may have no valid internal positions (such as single-character stop strings), so
+        # we need a fallback to handle this case
+        max_valid_positions = max(all_valid_positions) if all_valid_positions else 1
+        # There should always be at least one valid end_len, however, so no fallback needed here
+        valid_end_lens = [len(val) for positions in token_end_overlaps.values() for val in positions.values()]
+        if not valid_end_lens:
+            raise ValueError(
+                "Stop string preprocessing was unable to identify tokens matching one or more of the "
+                "supplied stop string(s). This is most often caused by the stop "
+                "strings containing unusual characters that are not in the tokenizer vocabulary."
+            )
+        max_valid_end_lens = max(valid_end_lens)
+        vec_size = len(stop_strings) * (max_valid_positions + max_valid_end_lens) + 1
+        gather_vec = np.full((len(token_list), vec_size), dtype=np.int32, fill_value=-1)
+
+        for i, stop_string in enumerate(stop_strings):
+            positions = token_valid_positions[stop_string]
+            end_lens = token_end_overlaps[stop_string]
+
+            # Since this is lots of very small assignments of lists, we build it with numpy rather
+            # than torch for speed + simplicity, then convert to torch at the end
+            for token_idx, valid_positions in positions.items():
+                gather_vec[token_idx, max_valid_positions * i : max_valid_positions * i + len(valid_positions)] = (
+                    valid_positions
+                )
+            for token_idx, possible_end_lens in end_lens.items():
+                gather_vec[
+                    token_idx,
+                    max_valid_positions * len(stop_strings) + max_valid_end_lens * i : max_valid_positions
+                    * len(stop_strings)
+                    + max_valid_end_lens * i
+                    + len(possible_end_lens),
+                ] = possible_end_lens
+            for token, token_idx in zip(token_list, token_indices):
+                gather_vec[token_idx, -1] = len(token)
+
+        gather_vec = torch.tensor(gather_vec, dtype=torch.int32)
+
+        return gather_vec, max_valid_positions, max_valid_end_lens
+
+    @add_start_docstrings(STOPPING_CRITERIA_INPUTS_DOCSTRING)
+    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> torch.Tensor:
+        self.embedding_vec = self.embedding_vec.to(input_ids.device)
+        self.target_lens = self.target_lens.to(input_ids.device)
+        # The maximum length we need to consider is 1 token per character. Note that input_ids can also be
+        # *shorter* than the global max, and the code below should be ready for that
+        input_ids = input_ids[:, -self.maximum_token_len :]
+
+        # Flip input_ids because we're only matching strings at the end of the generated sequence
+        flipped_ids = torch.flip(input_ids, (1,))
+
+        # Size of the vector of positions a single token can match
+        max_valid_positions = self.max_valid_positions
+
+        # The embedding vec contains the valid positions, end_lengths and total lengths for each token
+        embedded = F.embedding(flipped_ids, self.embedding_vec)
+
+        # Now we split the embedding vector. valid_positions is the positions in the stop string the token can fit
+        valid_positions = embedded[:, 1:, : max_valid_positions * self.num_stop_strings].unflatten(
+            -1, (self.num_stop_strings, -1)
+        )
+        # end_lengths is the number of characters from the string, counting from the end, that the token
+        # contains. It can have multiple values if the same token can overlap different end lengths
+        end_lengths = embedded[:, :1, max_valid_positions * self.num_stop_strings : -1].unflatten(
+            -1, (self.num_stop_strings, -1)
+        )
+        # Lengths is the total length of each token. Unlike the others, it always has a single value
+        lengths = embedded[:, 1:, None, -1:]  # Insert a dummy dimension for stop_strings even though lengths are const
+
+        # Concatenate lengths onto each possible end_lengths value
+        lengths = lengths.expand((-1, -1, end_lengths.shape[-2], end_lengths.shape[-1]))
+        lengths_with_ends = torch.cat([end_lengths, lengths], dim=1)
+
+        # cumsum() to get the number of matched characters in the stop string after each token
+        cumsum = lengths_with_ends.cumsum(dim=1)  # B x maximum_token_len x num_stop_strings x max_valid_end_lens
+
+        # The calculation above assumes that all tokens are in valid positions. Now we mask the ones that are not.
+        # First, tokens match the start of the string if they have a positive value in the end_lengths vector
+        initial_match = end_lengths > 0
+
+        # Tokens continue the string if the cumsum() so far is one of the valid positions for that token
+        # Note that we're actually tracking one cumsum() for for each possible end_length
+        later_match = torch.any(cumsum[:, :-1, :, None] == valid_positions[:, :, :, :, None], axis=-2)
+
+        # The match vector is a boolean vector that indicates which positions have valid tokens
+        match = torch.cat([initial_match, later_match], dim=1)
+
+        # Once a single position does not match, all positions following that position are masked
+        mask = (~match).cumsum(dim=1, dtype=torch.int32)
+        mask = mask == 0
+
+        # The string is matched if we reached a cumsum equal to or greater than the length of the string
+        # before hitting the mask
+        string_matches = torch.amax(cumsum * mask, dim=(1, -1)) >= self.target_lens[None, :]
+
+        # We return a per-sample vector that is True if any stop string is matched for that sample
+        return torch.any(string_matches, dim=-1)
+
+
+class EosTokenCriteria(StoppingCriteria):
+    """
+    This class can be used to stop generation whenever the "end-of-sequence" token is generated.
+    By default, it uses the `model.generation_config.eos_token_id`.
+
+    Args:
+        eos_token_id (`Union[int, List[int], torch.Tensor]`):
+            The id(s) of the *end-of-sequence* token.
+    """
+
+    def __init__(self, eos_token_id: Union[int, List[int], torch.Tensor]):
+        if not isinstance(eos_token_id, torch.Tensor):
+            if isinstance(eos_token_id, int):
+                eos_token_id = [eos_token_id]
+            eos_token_id = torch.tensor(eos_token_id)
+        self.eos_token_id = eos_token_id
+
+    @add_start_docstrings(STOPPING_CRITERIA_INPUTS_DOCSTRING)
+    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> torch.BoolTensor:
+        self.eos_token_id = self.eos_token_id.to(input_ids.device)
+        is_done = isin_mps_friendly(input_ids[:, -1], self.eos_token_id)
+        return is_done
+
+
+class ConfidenceCriteria(StoppingCriteria):
+    """
+    This class can be used to stop generation whenever assistant model's confidence in its prediction for the current token is lower than the threshold
+        `model.generation_config.assistant_confidence_threshold` even if the number of speculative tokens (defined by `num_assistant_tokens`) is not yet reached.
+
+    Args:
+        assistant_confidence_threshold (`float`):
+            The value of the threshold.
+    """
+
+    def __init__(self, assistant_confidence_threshold):
+        self.assistant_confidence_threshold = assistant_confidence_threshold
+
+    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> torch.BoolTensor:
+        probs = scores[-1].softmax(-1)
+        p = probs[0, input_ids[0, -1]].item()
+        if p < self.assistant_confidence_threshold:
+            return True
+        return False
+
+
+class StoppingCriteriaList(list):
+    @add_start_docstrings(STOPPING_CRITERIA_INPUTS_DOCSTRING)
+    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> torch.BoolTensor:
+        is_done = torch.full((input_ids.shape[0],), False, device=input_ids.device, dtype=torch.bool)
+        for criteria in self:
+            is_done = is_done | criteria(input_ids, scores, **kwargs)
+        return is_done
+
+    @property
+    def max_length(self) -> Optional[int]:
+        for stopping_criterium in self:
+            if isinstance(stopping_criterium, MaxLengthCriteria):
+                return stopping_criterium.max_length
+        return None
+
+
+def validate_stopping_criteria(stopping_criteria: StoppingCriteriaList, max_length: int) -> StoppingCriteriaList:
+    stopping_max_length = stopping_criteria.max_length
+    new_stopping_criteria = deepcopy(stopping_criteria)
+    if stopping_max_length is not None and stopping_max_length != max_length:
+        warnings.warn("You set different `max_length` for stopping criteria and `max_length` parameter", UserWarning)
+    elif stopping_max_length is None:
+        new_stopping_criteria.append(MaxLengthCriteria(max_length=max_length))
+    return new_stopping_criteria
diff --git a/.venv/lib/python3.11/site-packages/transformers/generation/streamers.py b/.venv/lib/python3.11/site-packages/transformers/generation/streamers.py
new file mode 100644
index 0000000000000000000000000000000000000000..7c25e343001577d41745e2cf033ffa30f70bd075
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/transformers/generation/streamers.py
@@ -0,0 +1,318 @@
+# coding=utf-8
+# Copyright 2023 The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import annotations
+
+import asyncio
+from queue import Queue
+from typing import TYPE_CHECKING, Optional
+
+
+if TYPE_CHECKING:
+    from ..models.auto import AutoTokenizer
+
+
+class BaseStreamer:
+    """
+    Base class from which `.generate()` streamers should inherit.
+    """
+
+    def put(self, value):
+        """Function that is called by `.generate()` to push new tokens"""
+        raise NotImplementedError()
+
+    def end(self):
+        """Function that is called by `.generate()` to signal the end of generation"""
+        raise NotImplementedError()
+
+
+class TextStreamer(BaseStreamer):
+    """
+    Simple text streamer that prints the token(s) to stdout as soon as entire words are formed.
+
+    <Tip warning={true}>
+
+    The API for the streamer classes is still under development and may change in the future.
+
+    </Tip>
+
+    Parameters:
+        tokenizer (`AutoTokenizer`):
+            The tokenized used to decode the tokens.
+        skip_prompt (`bool`, *optional*, defaults to `False`):
+            Whether to skip the prompt to `.generate()` or not. Useful e.g. for chatbots.
+        decode_kwargs (`dict`, *optional*):
+            Additional keyword arguments to pass to the tokenizer's `decode` method.
+
+    Examples:
+
+        ```python
+        >>> from transformers import AutoModelForCausalLM, AutoTokenizer, TextStreamer
+
+        >>> tok = AutoTokenizer.from_pretrained("openai-community/gpt2")
+        >>> model = AutoModelForCausalLM.from_pretrained("openai-community/gpt2")
+        >>> inputs = tok(["An increasing sequence: one,"], return_tensors="pt")
+        >>> streamer = TextStreamer(tok)
+
+        >>> # Despite returning the usual output, the streamer will also print the generated text to stdout.
+        >>> _ = model.generate(**inputs, streamer=streamer, max_new_tokens=20)
+        An increasing sequence: one, two, three, four, five, six, seven, eight, nine, ten, eleven,
+        ```
+    """
+
+    def __init__(self, tokenizer: "AutoTokenizer", skip_prompt: bool = False, **decode_kwargs):
+        self.tokenizer = tokenizer
+        self.skip_prompt = skip_prompt
+        self.decode_kwargs = decode_kwargs
+
+        # variables used in the streaming process
+        self.token_cache = []
+        self.print_len = 0
+        self.next_tokens_are_prompt = True
+
+    def put(self, value):
+        """
+        Receives tokens, decodes them, and prints them to stdout as soon as they form entire words.
+        """
+        if len(value.shape) > 1 and value.shape[0] > 1:
+            raise ValueError("TextStreamer only supports batch size 1")
+        elif len(value.shape) > 1:
+            value = value[0]
+
+        if self.skip_prompt and self.next_tokens_are_prompt:
+            self.next_tokens_are_prompt = False
+            return
+
+        # Add the new token to the cache and decodes the entire thing.
+        self.token_cache.extend(value.tolist())
+        text = self.tokenizer.decode(self.token_cache, **self.decode_kwargs)
+
+        # After the symbol for a new line, we flush the cache.
+        if text.endswith("\n"):
+            printable_text = text[self.print_len :]
+            self.token_cache = []
+            self.print_len = 0
+        # If the last token is a CJK character, we print the characters.
+        elif len(text) > 0 and self._is_chinese_char(ord(text[-1])):
+            printable_text = text[self.print_len :]
+            self.print_len += len(printable_text)
+        # Otherwise, prints until the last space char (simple heuristic to avoid printing incomplete words,
+        # which may change with the subsequent token -- there are probably smarter ways to do this!)
+        else:
+            printable_text = text[self.print_len : text.rfind(" ") + 1]
+            self.print_len += len(printable_text)
+
+        self.on_finalized_text(printable_text)
+
+    def end(self):
+        """Flushes any remaining cache and prints a newline to stdout."""
+        # Flush the cache, if it exists
+        if len(self.token_cache) > 0:
+            text = self.tokenizer.decode(self.token_cache, **self.decode_kwargs)
+            printable_text = text[self.print_len :]
+            self.token_cache = []
+            self.print_len = 0
+        else:
+            printable_text = ""
+
+        self.next_tokens_are_prompt = True
+        self.on_finalized_text(printable_text, stream_end=True)
+
+    def on_finalized_text(self, text: str, stream_end: bool = False):
+        """Prints the new text to stdout. If the stream is ending, also prints a newline."""
+        print(text, flush=True, end="" if not stream_end else None)
+
+    def _is_chinese_char(self, cp):
+        """Checks whether CP is the codepoint of a CJK character."""
+        # This defines a "chinese character" as anything in the CJK Unicode block:
+        #   https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block)
+        #
+        # Note that the CJK Unicode block is NOT all Japanese and Korean characters,
+        # despite its name. The modern Korean Hangul alphabet is a different block,
+        # as is Japanese Hiragana and Katakana. Those alphabets are used to write
+        # space-separated words, so they are not treated specially and handled
+        # like the all of the other languages.
+        if (
+            (cp >= 0x4E00 and cp <= 0x9FFF)
+            or (cp >= 0x3400 and cp <= 0x4DBF)  #
+            or (cp >= 0x20000 and cp <= 0x2A6DF)  #
+            or (cp >= 0x2A700 and cp <= 0x2B73F)  #
+            or (cp >= 0x2B740 and cp <= 0x2B81F)  #
+            or (cp >= 0x2B820 and cp <= 0x2CEAF)  #
+            or (cp >= 0xF900 and cp <= 0xFAFF)
+            or (cp >= 0x2F800 and cp <= 0x2FA1F)  #
+        ):  #
+            return True
+
+        return False
+
+
+class TextIteratorStreamer(TextStreamer):
+    """
+    Streamer that stores print-ready text in a queue, to be used by a downstream application as an iterator. This is
+    useful for applications that benefit from acessing the generated text in a non-blocking way (e.g. in an interactive
+    Gradio demo).
+
+    <Tip warning={true}>
+
+    The API for the streamer classes is still under development and may change in the future.
+
+    </Tip>
+
+    Parameters:
+        tokenizer (`AutoTokenizer`):
+            The tokenized used to decode the tokens.
+        skip_prompt (`bool`, *optional*, defaults to `False`):
+            Whether to skip the prompt to `.generate()` or not. Useful e.g. for chatbots.
+        timeout (`float`, *optional*):
+            The timeout for the text queue. If `None`, the queue will block indefinitely. Useful to handle exceptions
+            in `.generate()`, when it is called in a separate thread.
+        decode_kwargs (`dict`, *optional*):
+            Additional keyword arguments to pass to the tokenizer's `decode` method.
+
+    Examples:
+
+        ```python
+        >>> from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
+        >>> from threading import Thread
+
+        >>> tok = AutoTokenizer.from_pretrained("openai-community/gpt2")
+        >>> model = AutoModelForCausalLM.from_pretrained("openai-community/gpt2")
+        >>> inputs = tok(["An increasing sequence: one,"], return_tensors="pt")
+        >>> streamer = TextIteratorStreamer(tok)
+
+        >>> # Run the generation in a separate thread, so that we can fetch the generated text in a non-blocking way.
+        >>> generation_kwargs = dict(inputs, streamer=streamer, max_new_tokens=20)
+        >>> thread = Thread(target=model.generate, kwargs=generation_kwargs)
+        >>> thread.start()
+        >>> generated_text = ""
+        >>> for new_text in streamer:
+        ...     generated_text += new_text
+        >>> generated_text
+        'An increasing sequence: one, two, three, four, five, six, seven, eight, nine, ten, eleven,'
+        ```
+    """
+
+    def __init__(
+        self, tokenizer: "AutoTokenizer", skip_prompt: bool = False, timeout: Optional[float] = None, **decode_kwargs
+    ):
+        super().__init__(tokenizer, skip_prompt, **decode_kwargs)
+        self.text_queue = Queue()
+        self.stop_signal = None
+        self.timeout = timeout
+
+    def on_finalized_text(self, text: str, stream_end: bool = False):
+        """Put the new text in the queue. If the stream is ending, also put a stop signal in the queue."""
+        self.text_queue.put(text, timeout=self.timeout)
+        if stream_end:
+            self.text_queue.put(self.stop_signal, timeout=self.timeout)
+
+    def __iter__(self):
+        return self
+
+    def __next__(self):
+        value = self.text_queue.get(timeout=self.timeout)
+        if value == self.stop_signal:
+            raise StopIteration()
+        else:
+            return value
+
+
+class AsyncTextIteratorStreamer(TextStreamer):
+    """
+    Streamer that stores print-ready text in a queue, to be used by a downstream application as an async iterator.
+    This is useful for applications that benefit from acessing the generated text asynchronously (e.g. in an
+    interactive Gradio demo).
+
+    <Tip warning={true}>
+
+    The API for the streamer classes is still under development and may change in the future.
+
+    </Tip>
+
+    Parameters:
+        tokenizer (`AutoTokenizer`):
+            The tokenized used to decode the tokens.
+        skip_prompt (`bool`, *optional*, defaults to `False`):
+            Whether to skip the prompt to `.generate()` or not. Useful e.g. for chatbots.
+        timeout (`float`, *optional*):
+            The timeout for the text queue. If `None`, the queue will block indefinitely. Useful to handle exceptions
+            in `.generate()`, when it is called in a separate thread.
+        decode_kwargs (`dict`, *optional*):
+            Additional keyword arguments to pass to the tokenizer's `decode` method.
+
+    Raises:
+        TimeoutError: If token generation time exceeds timeout value.
+
+    Examples:
+
+        ```python
+        >>> from transformers import AutoModelForCausalLM, AutoTokenizer, AsyncTextIteratorStreamer
+        >>> from threading import Thread
+        >>> import asyncio
+
+        >>> tok = AutoTokenizer.from_pretrained("openai-community/gpt2")
+        >>> model = AutoModelForCausalLM.from_pretrained("openai-community/gpt2")
+        >>> inputs = tok(["An increasing sequence: one,"], return_tensors="pt")
+
+        >>> # Run the generation in a separate thread, so that we can fetch the generated text in a non-blocking way.
+        >>> async def main():
+        ...     # Important: AsyncTextIteratorStreamer must be initialized inside a coroutine!
+        ...     streamer = AsyncTextIteratorStreamer(tok)
+        ...     generation_kwargs = dict(inputs, streamer=streamer, max_new_tokens=20)
+        ...     thread = Thread(target=model.generate, kwargs=generation_kwargs)
+        ...     thread.start()
+        ...     generated_text = ""
+        ...     async for new_text in streamer:
+        ...         generated_text += new_text
+        >>>     print(generated_text)
+        >>> asyncio.run(main())
+        An increasing sequence: one, two, three, four, five, six, seven, eight, nine, ten, eleven,
+        ```
+    """
+
+    def __init__(
+        self, tokenizer: "AutoTokenizer", skip_prompt: bool = False, timeout: float | None = None, **decode_kwargs
+    ):
+        super().__init__(tokenizer, skip_prompt, **decode_kwargs)
+        self.text_queue = asyncio.Queue()
+        self.stop_signal = None
+        self.timeout = timeout
+        self.loop = asyncio.get_running_loop()
+        self.has_asyncio_timeout = hasattr(asyncio, "timeout")
+
+    def on_finalized_text(self, text: str, stream_end: bool = False):
+        """Put the new text in the queue. If the stream is ending, also put a stop signal in the queue."""
+        self.loop.call_soon_threadsafe(self.text_queue.put_nowait, text)
+        if stream_end:
+            self.loop.call_soon_threadsafe(self.text_queue.put_nowait, self.stop_signal)
+
+    def __aiter__(self):
+        return self
+
+    async def __anext__(self):
+        try:
+            if self.has_asyncio_timeout:
+                async with asyncio.timeout(self.timeout):
+                    value = await self.text_queue.get()
+            else:
+                value = await asyncio.wait_for(self.text_queue.get(), timeout=self.timeout)
+        except asyncio.TimeoutError:
+            raise TimeoutError()
+        else:
+            if value == self.stop_signal:
+                raise StopAsyncIteration()
+            else:
+                return value
diff --git a/.venv/lib/python3.11/site-packages/transformers/generation/tf_logits_process.py b/.venv/lib/python3.11/site-packages/transformers/generation/tf_logits_process.py
new file mode 100644
index 0000000000000000000000000000000000000000..f70655fb7c1d0b502dfc8f1c4f2c491867aa7e8c
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/transformers/generation/tf_logits_process.py
@@ -0,0 +1,603 @@
+# coding=utf-8
+# Copyright 2022 The HuggingFace Inc. team
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import inspect
+from typing import List, Tuple
+
+import numpy as np
+import tensorflow as tf
+
+from ..tf_utils import stable_softmax
+from ..utils import add_start_docstrings
+from ..utils.logging import get_logger
+
+
+logger = get_logger(__name__)
+
+
+TF_LOGITS_PROCESSOR_INPUTS_DOCSTRING = r"""
+    Args:
+        input_ids (`tf.Tensor` of shape `(batch_size, sequence_length)`):
+            Indices of input sequence tokens in the vocabulary.
+
+            Indices can be obtained using [`PreTrainedTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+            [`PreTrainedTokenizer.__call__`] for details.
+
+            [What are input IDs?](../glossary#input-ids)
+        scores (`tf.Tensor` of shape `(batch_size, config.vocab_size)`):
+            Prediction scores of a language modeling head. These can be logits for each vocabulary when not using beam
+            search or log softmax for each vocabulary token when using beam search.
+        cur_len (`int`):
+            The current length of valid input sequence tokens. In the TF implementation, the input_ids' sequence length
+            is the maximum length generate can produce, and we need to know which of its tokens are valid.
+        kwargs (`Dict[str, Any]`, *optional*):
+            Additional logits processor specific kwargs.
+
+    Return:
+        `tf.Tensor` of shape `(batch_size, config.vocab_size)`: The processed prediction scores.
+"""
+
+
+class TFLogitsProcessor:
+    """Abstract base class for all logit processors that can be applied during generation."""
+
+    @add_start_docstrings(TF_LOGITS_PROCESSOR_INPUTS_DOCSTRING)
+    def __call__(self, input_ids: tf.Tensor, scores: tf.Tensor, cur_len: int) -> tf.Tensor:
+        """TF method for processing logits."""
+        raise NotImplementedError(
+            f"{self.__class__} is an abstract class. Only classes inheriting this class can be called."
+        )
+
+
+class TFLogitsWarper:
+    """Abstract base class for all logit warpers that can be applied during generation with multinomial sampling."""
+
+    @add_start_docstrings(TF_LOGITS_PROCESSOR_INPUTS_DOCSTRING)
+    def __call__(self, input_ids: tf.Tensor, scores: tf.Tensor, cur_len: int) -> tf.Tensor:
+        """TF method for warping logits."""
+        raise NotImplementedError(
+            f"{self.__class__} is an abstract class. Only classes inheriting this class can be called."
+        )
+
+
+class TFLogitsProcessorList(list):
+    """
+    This class can be used to create a list of [`TFLogitsProcessor`] to subsequently process a `scores` input tensor.
+    This class inherits from list and adds a specific *__call__* method to apply each [`TFLogitsProcessor`] to the
+    inputs.
+    """
+
+    @add_start_docstrings(TF_LOGITS_PROCESSOR_INPUTS_DOCSTRING)
+    def __call__(self, input_ids: tf.Tensor, scores: tf.Tensor, cur_len: int, **kwargs) -> tf.Tensor:
+        for processor in self:
+            function_args = inspect.signature(processor.__call__).parameters
+            if len(function_args) > 3:
+                if not all(arg in kwargs for arg in list(function_args.keys())[2:]):
+                    raise ValueError(
+                        f"Make sure that all the required parameters: {list(function_args.keys())} for "
+                        f"{processor.__class__} are passed to the logits processor."
+                    )
+                scores = processor(input_ids, scores, cur_len, **kwargs)
+            else:
+                scores = processor(input_ids, scores, cur_len)
+        return scores
+
+
+class TFTemperatureLogitsWarper(TFLogitsWarper):
+    r"""
+    [`TFLogitsWarper`] for temperature (exponential scaling output probability distribution).
+
+    Args:
+        temperature (`float`):
+            The value used to module the logits distribution.
+    """
+
+    def __init__(self, temperature: float):
+        if not isinstance(temperature, float) or not (temperature > 0):
+            raise ValueError(f"`temperature` has to be a strictly positive float, but is {temperature}")
+
+        self.temperature = temperature
+
+    def __call__(self, input_ids: tf.Tensor, scores: tf.Tensor, cur_len: int) -> tf.Tensor:
+        scores = scores / self.temperature
+        return scores
+
+
+class TFTopKLogitsWarper(TFLogitsWarper):
+    r"""
+    [`TFLogitsWarper`] that performs top-k, i.e. restricting to the k highest probability elements.
+
+    Args:
+        top_k (`int`):
+            The number of highest probability vocabulary tokens to keep for top-k-filtering.
+        filter_value (`float`, *optional*, defaults to -inf):
+            All filtered values will be set to this float value.
+        min_tokens_to_keep (`int`, *optional*, defaults to 1):
+            Minimum number of tokens that cannot be filtered.
+    """
+
+    def __init__(self, top_k: int, filter_value: float = -float("Inf"), min_tokens_to_keep: int = 1):
+        if not isinstance(top_k, int) or top_k <= 0:
+            raise ValueError(f"`top_k` has to be a strictly positive integer, but is {top_k}")
+
+        self.top_k = max(top_k, min_tokens_to_keep)
+        self.filter_value = filter_value
+
+    def __call__(self, input_ids: tf.Tensor, scores: tf.Tensor, cur_len: int) -> tf.Tensor:
+        top_k = min(self.top_k, scores.shape[-1])  # Safety check
+        # Boolean mask containing all tokens with a probability less than the last token of the top-k
+        indices_to_remove = scores < tf.math.top_k(scores, k=top_k)[0][..., -1:]
+        next_scores = tf.where(indices_to_remove, self.filter_value, scores)
+        return next_scores
+
+
+class TFTopPLogitsWarper(TFLogitsWarper):
+    """
+    [`TFLogitsWarper`] that performs top-p, i.e. restricting to top tokens summing to <= prob_cut_off.
+
+    Args:
+        top_p (`float`):
+            If set to < 1, only the smallest set of most probable tokens with probabilities that add up to `top_p` or
+            higher are kept for generation.
+        filter_value (`float`, *optional*, defaults to -inf):
+            All filtered values will be set to this float value.
+        min_tokens_to_keep (`int`, *optional*, defaults to 1):
+            Minimum number of tokens that cannot be filtered.
+    """
+
+    def __init__(self, top_p: float, filter_value: float = -float("Inf"), min_tokens_to_keep: int = 1):
+        if not isinstance(top_p, float) or (top_p < 0 or top_p > 1.0):
+            raise ValueError(f"`top_p` has to be a float > 0 and < 1, but is {top_p}")
+        if not isinstance(min_tokens_to_keep, int) or (min_tokens_to_keep < 1):
+            raise ValueError(f"`min_tokens_to_keep` has to be a positive integer, but is {min_tokens_to_keep}")
+
+        self.top_p = top_p
+        self.filter_value = filter_value
+        self.min_tokens_to_keep = min_tokens_to_keep
+
+    def __call__(self, input_ids: tf.Tensor, scores: tf.Tensor, cur_len: int) -> tf.Tensor:
+        topk_scores, topk_indices = tf.math.top_k(scores, scores.shape[-1])
+
+        mask_scores = tf.fill(scores.shape, self.filter_value)
+        cumulative_probs = tf.math.cumsum(stable_softmax(topk_scores, axis=-1), axis=-1)
+        score_mask = cumulative_probs < self.top_p
+
+        # Also include the token that is higher than top_p (the first false = shift and insert a True on the left)
+        score_mask = tf.concat((tf.ones([score_mask.shape[0], 1], dtype=tf.bool), score_mask[:, :-1]), axis=-1)
+
+        # Ensure min tokens to keep
+        score_mask = tf.concat(
+            (
+                tf.ones([score_mask.shape[0], self.min_tokens_to_keep], dtype=tf.bool),
+                score_mask[:, self.min_tokens_to_keep :],
+            ),
+            axis=-1,
+        )
+
+        # Mask the values that do not fit the criteria
+        topk_next_scores = tf.where(score_mask, topk_scores, mask_scores)
+
+        # Undo the topk sorting: converts the 2D matrix of per-row original indices of shape (batch_size, vocab_size)
+        # to a 3D tensor of shape (batch_size, vocab_size, 2) containing the original score coordinate, from which we
+        # can scatter (i.e. `scatter_indices[row, col, :]` is a tensor containing `[row, topk_indices[row, col]]`)
+        scatter_rows = tf.tile(tf.expand_dims(tf.range(topk_indices.shape[0]), axis=-1), [1, topk_indices.shape[-1]])
+        scatter_indices = tf.stack((scatter_rows, topk_indices), axis=-1)
+        next_scores = tf.scatter_nd(scatter_indices, topk_next_scores, shape=topk_next_scores.shape)
+
+        return next_scores
+
+
+class TFMinLengthLogitsProcessor(TFLogitsProcessor):
+    r"""
+    [`TFLogitsProcessor`] enforcing a min-length by setting EOS probability to 0.
+
+    Args:
+        min_length (`int`):
+            The minimum length below which the score of `eos_token_id` is set to `-float("Inf")`.
+        eos_token_id (`int`):
+            The id of the *end-of-sequence* token.
+    """
+
+    def __init__(self, min_length: int, eos_token_id: int):
+        if not isinstance(min_length, int) or min_length < 0:
+            raise ValueError(f"`min_length` has to be a positive integer, but is {min_length}")
+
+        if not isinstance(eos_token_id, int) or eos_token_id < 0:
+            raise ValueError(f"`eos_token_id` has to be a positive integer, but is {eos_token_id}")
+
+        self.min_length = min_length
+        self.eos_token_id = eos_token_id
+
+    def _apply_eos_token_mask(self, scores: tf.Tensor) -> tf.Tensor:
+        eos_token_id_mask = tf.range(scores.shape[-1]) == self.eos_token_id
+        scores = tf.where(eos_token_id_mask, float("-inf"), scores)
+        return scores
+
+    def __call__(self, input_ids: tf.Tensor, scores: tf.Tensor, cur_len: int) -> tf.Tensor:
+        # applies eos token masking if the first argument is true
+        scores = tf.cond(
+            tf.less(cur_len, self.min_length),
+            lambda: self._apply_eos_token_mask(scores),
+            lambda: tf.identity(scores),
+        )
+        return scores
+
+
+class TFRepetitionPenaltyLogitsProcessor(TFLogitsProcessor):
+    r"""
+    [`TFLogitsProcessor`] enforcing an exponential penalty on repeated sequences.
+
+    Args:
+        repetition_penalty (`float`):
+            The parameter for repetition penalty. 1.0 means no penalty. See [this
+            paper](https://arxiv.org/pdf/1909.05858.pdf) for more details.
+    """
+
+    def __init__(self, penalty: float):
+        if not isinstance(penalty, float) or not (penalty > 0):
+            raise ValueError(f"`penalty` has to be a strictly positive float, but is {penalty}")
+
+        self.penalty = penalty
+
+    def _create_score_penalties(self, input_ids: tf.Tensor, logits: tf.Tensor) -> tf.Tensor:
+        # We want to populate the penalties in the positions of `input_ids`. Since XLA can't handle shapes unknown
+        # before runtime, `tf.unique` can't be used. Therefore, we may have redundant updates, when a given row has
+        # the same token multiple times.
+
+        # Gathers the penalties to apply
+        logit_penalties = tf.gather(logits, input_ids, axis=1, batch_dims=1)
+        logit_penalties = tf.where(logit_penalties > 0, 1 / self.penalty, logit_penalties)
+        logit_penalties = tf.where(logit_penalties < 0, self.penalty, logit_penalties)
+
+        # Scatters the penalties
+        token_penalties = tf.ones(logits.shape)
+        batch_size = input_ids.shape[0]
+        seq_len = tf.shape(input_ids)[1]  # the sequence length has dynamic size, hence the dynamic shape
+        indexable_prev_input_ids = tf.concat(
+            (
+                tf.expand_dims(tf.repeat(tf.range(batch_size), seq_len), axis=-1),
+                tf.expand_dims(tf.reshape(input_ids, [-1]), axis=-1),
+            ),
+            axis=1,
+        )
+        token_penalties = tf.tensor_scatter_nd_update(
+            token_penalties, indices=indexable_prev_input_ids, updates=tf.reshape(logit_penalties, [-1])
+        )
+        return token_penalties
+
+    def __call__(self, input_ids: tf.Tensor, scores: tf.Tensor, cur_len: int) -> tf.Tensor:
+        score_penalties = self._create_score_penalties(input_ids[:, :cur_len], scores)
+
+        scores = tf.math.multiply(scores, score_penalties)
+
+        return scores
+
+
+class TFNoBadWordsLogitsProcessor(TFLogitsProcessor):
+    """
+    [`TFLogitsProcessor`] that enforces that specified sequences will never be sampled.
+
+    Args:
+        bad_words_ids (`List[List[int]]`):
+            List of list of token ids that are not allowed to be generated. In order to get the tokens of the words
+            that should not appear in the generated text, make sure to set `add_prefix_space=True` when initializing
+            the tokenizer, and use `tokenizer(bad_words, add_special_tokens=False).input_ids`. The `add_prefix_space`
+            argument is only supported for some slow tokenizers, as fast tokenizers' prefixing behaviours come from
+            `pre tokenizers`. Read more [here](https://huggingface.co/docs/tokenizers/api/pre-tokenizers).
+        eos_token_id (`int`):
+            The id of the *end-of-sequence* token.
+    """
+
+    def __init__(self, bad_words_ids: List[List[int]], eos_token_id: int):
+        if not isinstance(bad_words_ids, List) or len(bad_words_ids) == 0:
+            raise ValueError(f"`bad_words_ids` has to be a non-empty list, but is {bad_words_ids}.")
+        if any(not isinstance(bad_word_ids, list) for bad_word_ids in bad_words_ids):
+            raise ValueError(f"`bad_words_ids` has to be a list of lists, but is {bad_words_ids}.")
+        if any(
+            any((not isinstance(token_id, (int, np.integer)) or token_id < 0) for token_id in bad_word_ids)
+            for bad_word_ids in bad_words_ids
+        ):
+            raise ValueError(
+                f"Each list in `bad_words_ids` has to be a list of positive integers, but is {bad_words_ids}."
+            )
+
+        # stores the information about bad words in three tensors:
+        # 1. a rectangular tensor with the forbidden sequences (padded with `-1`), for full data comparisons
+        self.bad_word_seqs_ids = tf.ragged.constant(bad_words_ids).to_tensor(default_value=-1)
+        # 2. a tensor with the unpadded length of each forbidden sequence, for quick length comparisons
+        bad_word_seqs_len = [len(bad_words) for bad_words in bad_words_ids]
+        if any(word_len == 0 for word_len in bad_word_seqs_len):
+            raise ValueError(f"Banned words token sequences {bad_words_ids} cannot have an empty list")
+        self.bad_word_seqs_len = tf.convert_to_tensor(bad_word_seqs_len, dtype=tf.int32)
+        # 3. a tensor containing the last token for each sequence, for easy access to the tokens that may be banned
+        self.seq_forbidden_tokens = tf.convert_to_tensor([bad_words[-1] for bad_words in bad_words_ids])
+
+    def _calc_row_banned_bad_tokens(self, row_input_ids: tf.Tensor) -> tf.Tensor:
+        def _tokens_match(bad_word_seq_number):
+            def _len_one():
+                # If the bad sequence only has one token, always mask it
+                return tf.cond(
+                    tf.math.equal(self.bad_word_seqs_len[bad_word_seq_number], 1),
+                    lambda: tf.ones((), dtype=tf.bool),
+                    _len_greater_than_cur_len,
+                )
+
+            def _len_greater_than_cur_len():
+                # Otherwise, if the bad sequence is longer than the current length they can't ever match
+                return tf.cond(
+                    tf.math.greater(self.bad_word_seqs_len[bad_word_seq_number], tf.shape(row_input_ids)[0]),
+                    lambda: tf.zeros((), dtype=tf.bool),
+                    _match_found,
+                )
+
+            def _match_found():
+                # Finaly, runs the actual comparison. Can only be called if the previous comparisons do not yield
+                # an answer (otherwise we get indexing exceptions)
+                compare_len = self.bad_word_seqs_len[bad_word_seq_number] - 1
+                return tf.cond(
+                    tf.math.reduce_all(
+                        tf.math.equal(
+                            row_input_ids[-compare_len:], self.bad_word_seqs_ids[bad_word_seq_number, :compare_len]
+                        )
+                    ),
+                    lambda: tf.ones((), dtype=tf.bool),
+                    lambda: tf.zeros((), dtype=tf.bool),
+                )
+
+            match = _len_one()
+            return match
+
+        # Compares the current row against all bad word sequences, obtaining a mask with the matches.
+        match_mask = tf.map_fn(_tokens_match, tf.range(self.bad_word_seqs_ids.shape[0]), fn_output_signature=tf.bool)
+        row_banned_tokens = self.seq_forbidden_tokens[match_mask]
+        return row_banned_tokens
+
+    def __call__(self, input_ids: tf.Tensor, scores: tf.Tensor, cur_len: int) -> tf.Tensor:
+        # We want to mask some banned tokens, at a score level. Since the banned tokens depend on the previous
+        # `input_ids`, they may have a different length for each row, and they may even be empty for some rows.
+        # To remain simple and XLA-compatible, we work on a per-row fashion.
+        # TODO (Joao): this function might trigger XLA retracing as `cur_len` increases. Fix it if it becomes
+        # a frequent choke point. (make `cur_len` a tensor?)
+        def _get_row_updated_score(row_inputs: Tuple[tf.Tensor]) -> tf.Tensor:
+            row_input_ids, row_score = row_inputs
+            banned_tokens = self._calc_row_banned_bad_tokens(row_input_ids[:cur_len])
+            banned_tokens_mask = tf.scatter_nd(
+                indices=tf.expand_dims(banned_tokens, axis=-1),
+                updates=tf.ones_like(banned_tokens, dtype=tf.bool),
+                shape=row_score.shape,
+            )
+            row_score = tf.where(banned_tokens_mask, -float("inf"), row_score)
+            return row_score
+
+        scores = tf.map_fn(_get_row_updated_score, (input_ids, scores), fn_output_signature=tf.float32)
+        return scores
+
+
+class TFNoRepeatNGramLogitsProcessor(TFLogitsProcessor):
+    r"""
+    [`TFLogitsProcessor`] that enforces no repetition of n-grams. See
+    [Fairseq](https://github.com/pytorch/fairseq/blob/a07cb6f40480928c9e0548b737aadd36ee66ac76/fairseq/sequence_generator.py#L345).
+
+    Args:
+        ngram_size (`int`):
+            All ngrams of size `ngram_size` can only occur once.
+    """
+
+    def __init__(self, ngram_size: int):
+        if not isinstance(ngram_size, int) or ngram_size <= 0:
+            raise ValueError(f"`ngram_size` has to be a strictly positive integer, but is {ngram_size}")
+        self.ngram_size = ngram_size
+
+    def calc_banned_ngram_tokens(self, input_ids, num_hypos, cur_len):
+        # Copied from fairseq for no_repeat_ngram in beam_search
+        if cur_len + 1 < self.ngram_size:
+            # return no banned tokens if we haven't generated ngram_size tokens yet
+            return [[] for _ in range(num_hypos)]
+        generated_ngrams = [{} for _ in range(num_hypos)]
+        prev_input_ids = input_ids[:, :cur_len]
+        for idx in range(num_hypos):
+            gen_tokens = prev_input_ids[idx].numpy().tolist()
+            generated_ngram = generated_ngrams[idx]
+            for ngram in zip(*[gen_tokens[i:] for i in range(self.ngram_size)]):
+                prev_ngram_tuple = tuple(ngram[:-1])
+                generated_ngram[prev_ngram_tuple] = generated_ngram.get(prev_ngram_tuple, []) + [ngram[-1]]
+
+        def _get_generated_ngrams(hypo_idx):
+            # Before decoding the next token, prevent decoding of ngrams that have already appeared
+            start_idx = cur_len + 1 - self.ngram_size
+            ngram_idx = tuple(prev_input_ids[hypo_idx, start_idx:cur_len].numpy().tolist())
+            return generated_ngrams[hypo_idx].get(ngram_idx, [])
+
+        banned_tokens = [_get_generated_ngrams(hypo_idx) for hypo_idx in range(num_hypos)]
+
+        return banned_tokens
+
+    def __call__(self, input_ids: tf.Tensor, scores: tf.Tensor, cur_len: int) -> tf.Tensor:
+        # TODO (joao): enable XLA on this logits processor. See discussion and attempts in
+        # https://github.com/huggingface/transformers/pull/16974
+        if not tf.executing_eagerly():
+            raise NotImplementedError("TFNoRepeatNGramLogitsProcessor is only implemented for eager execution.")
+
+        batch_size, vocab_size = scores.shape
+        banned_tokens = self.calc_banned_ngram_tokens(input_ids, batch_size, cur_len)
+
+        # create banned_tokens boolean mask
+        banned_tokens_indices_mask = []
+        for banned_tokens_slice in banned_tokens:
+            banned_tokens_indices_mask.append(
+                [True if token in banned_tokens_slice else False for token in range(vocab_size)]
+            )
+
+        scores = tf.where(tf.convert_to_tensor(banned_tokens_indices_mask, dtype=tf.bool), -float("inf"), scores)
+
+        return scores
+
+
+class TFForcedBOSTokenLogitsProcessor(TFLogitsProcessor):
+    r"""
+    [`TFLogitsProcessor`] that enforces the specified token as the first generated token.
+
+    Args:
+        bos_token_id (`int`):
+            The id of the token to force as the first generated token.
+    """
+
+    def __init__(self, bos_token_id: int):
+        if bos_token_id < 0:
+            raise ValueError(f"The forced bos token id  must be a non-negative integer, got {bos_token_id}")
+        self.bos_token_id = bos_token_id
+
+    def __call__(self, input_ids: tf.Tensor, scores: tf.Tensor, cur_len: int) -> tf.Tensor:
+        if cur_len == 1:
+            batch_size, num_tokens = scores.shape
+            # sets the score to 0 in the bos_token_id column
+            scores = tf.zeros((batch_size, 1))
+            # sets the score to -inf everywhere else
+            if self.bos_token_id > 0:
+                scores = tf.concat((tf.broadcast_to(-float("inf"), (batch_size, self.bos_token_id)), scores), axis=-1)
+            if self.bos_token_id < (num_tokens - 1):
+                scores = tf.concat(
+                    (scores, tf.broadcast_to(-float("inf"), (batch_size, (num_tokens - 1) - self.bos_token_id))),
+                    axis=-1,
+                )
+        return scores
+
+
+class TFForcedEOSTokenLogitsProcessor(TFLogitsProcessor):
+    r"""
+    [`TFLogitsProcessor`] that enforces the specified token as the last generated token when `max_length` is reached.
+
+    Args:
+        max_length (`int`):
+            The maximum length of the sequence to be generated.
+        eos_token_id (`int`):
+            The id of the token to force as the last generated token when `max_length` is reached.
+    """
+
+    def __init__(self, max_length: int, eos_token_id: int):
+        self.max_length = max_length
+        if eos_token_id < 0:
+            raise ValueError(f"The forced eos token id must be a non-negative integer, got {eos_token_id}")
+        self.eos_token_id = eos_token_id
+
+    def __call__(self, input_ids: tf.Tensor, scores: tf.Tensor, cur_len: int) -> tf.Tensor:
+        if cur_len == self.max_length - 1:
+            batch_size, num_tokens = scores.shape
+            # sets the score to 0 in the eos_token_id column
+            scores = tf.zeros((batch_size, 1))
+            # sets the score to -inf everywhere else
+            if self.eos_token_id > 0:
+                scores = tf.concat((tf.broadcast_to(-float("inf"), (batch_size, self.eos_token_id)), scores), axis=-1)
+            if self.eos_token_id < (num_tokens - 1):
+                scores = tf.concat(
+                    (scores, tf.broadcast_to(-float("inf"), (batch_size, (num_tokens - 1) - self.eos_token_id))),
+                    axis=-1,
+                )
+        return scores
+
+
+class TFSuppressTokensAtBeginLogitsProcessor(TFLogitsProcessor):
+    r"""
+    [`TFSuppressTokensAtBeginLogitsProcessor`] suppresses a list of tokens as soon as the `generate` function starts
+    generating using `begin_index` tokens. This should ensure that the tokens defined by `begin_suppress_tokens` at not
+    sampled at the beginning of the generation.
+    """
+
+    def __init__(self, begin_suppress_tokens, begin_index):
+        self.begin_suppress_tokens = list(begin_suppress_tokens)
+        self.begin_index = begin_index
+
+    def __call__(self, input_ids: tf.Tensor, scores: tf.Tensor, cur_len: int) -> tf.Tensor:
+        suppressed_indices = []
+        for token in self.begin_suppress_tokens:
+            if token < scores.shape[-1]:  # to ensure we don't go beyond the vocab size
+                suppressed_indices.extend([[i, token] for i in range(scores.shape[0])])
+
+        if len(suppressed_indices) > 0:
+            scores = tf.cond(
+                tf.equal(cur_len, self.begin_index),
+                lambda: tf.tensor_scatter_nd_update(
+                    scores,
+                    indices=suppressed_indices,
+                    updates=[-float("inf") for _ in range(scores.shape[0] * len(self.begin_suppress_tokens))],
+                ),
+                lambda: scores,
+            )
+        return scores
+
+
+class TFSuppressTokensLogitsProcessor(TFLogitsProcessor):
+    r"""This processor can be used to suppress a list of tokens. The processor will set their log probs to `-inf` so that they
+    are not sampled."""
+
+    def __init__(self, suppress_tokens):
+        self.suppress_tokens = list(suppress_tokens)
+
+    def __call__(self, input_ids: tf.Tensor, scores: tf.Tensor, cur_len: int) -> tf.Tensor:
+        suppressed_indices = []
+        for token in self.suppress_tokens:
+            if token < scores.shape[-1]:  # to ensure we don't go beyond the vocab size
+                suppressed_indices.extend([[i, token] for i in range(scores.shape[0])])
+
+        if len(suppressed_indices) > 0:
+            scores = tf.tensor_scatter_nd_update(
+                scores,
+                indices=[[i, token] for i in range(scores.shape[0]) for token in self.suppress_tokens],
+                updates=[-float("inf") for _ in range(scores.shape[0] * len(self.suppress_tokens))],
+            )
+        return scores
+
+
+class TFForceTokensLogitsProcessor(TFLogitsProcessor):
+    r"""This processor takes a list of pairs of integers which indicates a mapping from generation indices to token
+    indices that will be forced before sampling. The processor will set their log probs to `0` and all other tokens to
+    `-inf` so that they are sampled at their corresponding index."""
+
+    def __init__(self, force_token_map: List[List[int]]):
+        force_token_map = dict(force_token_map)
+        # Converts the dictionary of format {index: token} containing the tokens to be forced to an array, where the
+        # index of the array corresponds to the index of the token to be forced, for XLA compatibility.
+        # Indexes without forced tokens will have an negative value.
+        force_token_array = np.ones((max(force_token_map.keys()) + 1), dtype=np.int32) * -1
+        for index, token in force_token_map.items():
+            if token is not None:
+                force_token_array[index] = token
+        self.force_token_array = tf.convert_to_tensor(force_token_array, dtype=tf.int32)
+
+    def __call__(self, input_ids: tf.Tensor, scores: tf.Tensor, cur_len: int) -> tf.Tensor:
+        def _force_token(generation_idx):
+            batch_size = scores.shape[0]
+            current_token = self.force_token_array[generation_idx]
+
+            new_scores = tf.zeros_like(scores, dtype=scores.dtype) + tf.constant([scores.dtype.min])
+            indices = tf.stack((tf.range(batch_size), tf.tile([current_token], [batch_size])), axis=1)
+            updates = tf.zeros((batch_size,), dtype=scores.dtype)
+            new_scores = tf.tensor_scatter_nd_update(new_scores, indices, updates)
+            return new_scores
+
+        scores = tf.cond(
+            tf.greater_equal(cur_len, tf.shape(self.force_token_array)[0]),
+            # If the current length is geq than the length of force_token_array, the processor does nothing.
+            lambda: tf.identity(scores),
+            # Otherwise, it may force a certain token.
+            lambda: tf.cond(
+                tf.greater_equal(self.force_token_array[cur_len], 0),
+                # Only valid (positive) tokens are forced
+                lambda: _force_token(cur_len),
+                # Otherwise, the processor does nothing.
+                lambda: scores,
+            ),
+        )
+        return scores
diff --git a/.venv/lib/python3.11/site-packages/transformers/generation/tf_utils.py b/.venv/lib/python3.11/site-packages/transformers/generation/tf_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..dd1f819fe548aa589ac11c8d3e3ef3338d3d208c
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/transformers/generation/tf_utils.py
@@ -0,0 +1,3130 @@
+# coding=utf-8
+# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
+# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import copy
+import inspect
+import warnings
+from dataclasses import dataclass
+from typing import Any, Dict, Optional, Tuple, Union
+
+import numpy as np
+import tensorflow as tf
+from tensorflow.compiler.tf2xla.python.xla import dynamic_update_slice
+
+from ..modeling_tf_outputs import TFCausalLMOutputWithPast, TFSeq2SeqLMOutput
+from ..models.auto import (
+    TF_MODEL_FOR_CAUSAL_LM_MAPPING,
+    TF_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING,
+    TF_MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING,
+    TF_MODEL_FOR_VISION_2_SEQ_MAPPING,
+)
+from ..tf_utils import shape_list, stable_softmax
+from ..utils import ModelOutput, logging
+from .configuration_utils import GenerationConfig
+from .tf_logits_process import (
+    TFForcedBOSTokenLogitsProcessor,
+    TFForcedEOSTokenLogitsProcessor,
+    TFForceTokensLogitsProcessor,
+    TFLogitsProcessorList,
+    TFMinLengthLogitsProcessor,
+    TFNoBadWordsLogitsProcessor,
+    TFNoRepeatNGramLogitsProcessor,
+    TFRepetitionPenaltyLogitsProcessor,
+    TFSuppressTokensAtBeginLogitsProcessor,
+    TFSuppressTokensLogitsProcessor,
+    TFTemperatureLogitsWarper,
+    TFTopKLogitsWarper,
+    TFTopPLogitsWarper,
+)
+
+
+logger = logging.get_logger(__name__)
+
+
+@dataclass
+class TFGreedySearchDecoderOnlyOutput(ModelOutput):
+    """
+    Base class for outputs of decoder-only generation models using greedy search.
+
+
+    Args:
+        sequences (`tf.Tensor` of shape `(batch_size, sequence_length)`):
+            The generated sequences. The second dimension (sequence_length) is either equal to `max_length` or shorter
+            if all batches finished early due to the `eos_token_id`.
+        scores (`tuple(tf.Tensor)` *optional*, returned when `output_scores=True` is passed or when `config.output_scores=True`):
+            Processed prediction scores of the language modeling head (scores for each vocabulary token before SoftMax)
+            at each generation step. Tuple of `tf.Tensor` with up to `max_new_tokens` elements (one element for each
+            generated token), with each tensor of shape `(batch_size, config.vocab_size)`.
+        attentions (`tuple(tuple(tf.Tensor))`, *optional*, returned when `output_attentions=True` is passed or `config.output_attentions=True`):
+            Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of
+            `tf.Tensor` of shape `(batch_size, num_heads, generated_length, sequence_length)`.
+        hidden_states (`tuple(tuple(tf.Tensor))`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+            Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of
+            `tf.Tensor` of shape `(batch_size, generated_length, hidden_size)`.
+    """
+
+    sequences: tf.Tensor = None
+    scores: Optional[Tuple[tf.Tensor]] = None
+    attentions: Optional[Tuple[Tuple[tf.Tensor]]] = None
+    hidden_states: Optional[Tuple[Tuple[tf.Tensor]]] = None
+
+
+@dataclass
+class TFGreedySearchEncoderDecoderOutput(ModelOutput):
+    """
+    Base class for outputs of encoder-decoder generation models using greedy search. Hidden states and attention
+    weights of the decoder (respectively the encoder) can be accessed via the encoder_attentions and the
+    encoder_hidden_states attributes (respectively the decoder_attentions and the decoder_hidden_states attributes)
+
+
+    Args:
+        sequences (`tf.Tensor` of shape `(batch_size, sequence_length)`):
+            The generated sequences. The second dimension (sequence_length) is either equal to `max_length` or shorter
+            if all batches finished early due to the `eos_token_id`.
+        scores (`tuple(tf.Tensor)` *optional*, returned when `output_scores=True` is passed or when `config.output_scores=True`):
+            Processed prediction scores of the language modeling head (scores for each vocabulary token before SoftMax)
+            at each generation step. Tuple of `tf.Tensor` with up to `max_new_tokens` elements (one element for each
+            generated token), with each tensor of shape `(batch_size, config.vocab_size)`.
+        encoder_attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or `config.output_attentions=True`):
+            Tuple of `tf.Tensor` (one for each layer of the decoder) of shape `(batch_size, num_heads, sequence_length,
+            sequence_length)`.
+        encoder_hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+            Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape
+            `(batch_size, sequence_length, hidden_size)`.
+        decoder_attentions (`tuple(tuple(tf.Tensor))`, *optional*, returned when `output_attentions=True` is passed or `config.output_attentions=True`):
+            Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of
+            `tf.Tensor` of shape `(batch_size, num_heads, generated_length, sequence_length)`.
+        cross_attentions (`tuple(tuple(tf.Tensor))`, *optional*, returned when `output_attentions=True` is passed or `config.output_attentions=True`):
+            Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of
+            `tf.Tensor` of shape `(batch_size, num_heads, generated_length, sequence_length)`.
+        decoder_hidden_states (`tuple(tuple(tf.Tensor))`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+            Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of
+            `tf.Tensor` of shape `(batch_size, generated_length, hidden_size)`.
+    """
+
+    sequences: tf.Tensor = None
+    scores: Optional[Tuple[tf.Tensor]] = None
+    encoder_attentions: Optional[Tuple[tf.Tensor]] = None
+    encoder_hidden_states: Optional[Tuple[tf.Tensor]] = None
+    decoder_attentions: Optional[Tuple[Tuple[tf.Tensor]]] = None
+    cross_attentions: Optional[Tuple[Tuple[tf.Tensor]]] = None
+    decoder_hidden_states: Optional[Tuple[Tuple[tf.Tensor]]] = None
+
+
+@dataclass
+class TFSampleDecoderOnlyOutput(ModelOutput):
+    """
+    Base class for outputs of decoder-only generation models using sampling.
+
+
+    Args:
+        sequences (`tf.Tensor` of shape `(batch_size*num_return_sequences, sequence_length)`):
+            The generated sequences. The second dimension (sequence_length) is either equal to `max_length` or shorter
+            if all batches finished early due to the `eos_token_id`.
+        scores (`tuple(tf.Tensor)` *optional*, returned when `output_scores=True` is passed or when `config.output_scores=True`):
+            Processed prediction scores of the language modeling head (scores for each vocabulary token before SoftMax)
+            at each generation step. Tuple of `tf.Tensor` with up to `max_new_tokens` elements (one element for each
+            generated token), with each tensor of shape `(batch_size*num_return_sequences, config.vocab_size)`.
+        attentions (`tuple(tuple(tf.Tensor))`, *optional*, returned when `output_attentions=True` is passed or `config.output_attentions=True`):
+            Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of
+            `tf.Tensor` of shape `(num_return_sequences*batch_size, num_heads, generated_length, sequence_length)`.
+        hidden_states (`tuple(tuple(tf.Tensor))`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+            Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of
+            `tf.Tensor` of shape `(num_return_sequences*batch_size, generated_length, hidden_size)`.
+    """
+
+    sequences: tf.Tensor = None
+    scores: Optional[Tuple[tf.Tensor]] = None
+    attentions: Optional[Tuple[Tuple[tf.Tensor]]] = None
+    hidden_states: Optional[Tuple[Tuple[tf.Tensor]]] = None
+
+
+@dataclass
+class TFSampleEncoderDecoderOutput(ModelOutput):
+    """
+    Base class for outputs of encoder-decoder generation models using sampling. Hidden states and attention weights of
+    the decoder (respectively the encoder) can be accessed via the encoder_attentions and the encoder_hidden_states
+    attributes (respectively the decoder_attentions and the decoder_hidden_states attributes)
+
+
+    Args:
+        sequences (`tf.Tensor` of shape `(batch_size*num_return_sequences, sequence_length)`):
+            The generated sequences. The second dimension (sequence_length) is either equal to `max_length` or shorter
+            if all batches finished early due to the `eos_token_id`.
+        scores (`tuple(tf.Tensor)` *optional*, returned when `output_scores=True` is passed or when `config.output_scores=True`):
+            Processed prediction scores of the language modeling head (scores for each vocabulary token before SoftMax)
+            at each generation step. Tuple of `tf.Tensor` with up to `max_new_tokens` elements (one element for each
+            generated token), with each tensor of shape `(batch_size*num_return_sequences, config.vocab_size)`.
+        encoder_attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or `config.output_attentions=True`):
+            Tuple of `tf.Tensor` (one for each layer of the decoder) of shape `(batch_size*num_return_sequences,
+            num_heads, sequence_length, sequence_length)`.
+        encoder_hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+            Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape
+            `(batch_size*num_return_sequences, sequence_length, hidden_size)`.
+        decoder_attentions (`tuple(tuple(tf.Tensor))`, *optional*, returned when `output_attentions=True` is passed or `config.output_attentions=True`):
+            Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of
+            `tf.Tensor` of shape `(batch_size*num_return_sequences, num_heads, generated_length, sequence_length)`.
+        cross_attentions (`tuple(tuple(tf.Tensor))`, *optional*, returned when `output_attentions=True` is passed or `config.output_attentions=True`):
+            Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of
+            `tf.Tensor` of shape `(batch_size, num_heads, generated_length, sequence_length)`.
+        decoder_hidden_states (`tuple(tuple(tf.Tensor))`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+            Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of
+            `tf.Tensor` of shape `(batch_size*num_return_sequences, generated_length, hidden_size)`.
+    """
+
+    sequences: tf.Tensor = None
+    scores: Optional[Tuple[tf.Tensor]] = None
+    encoder_attentions: Optional[Tuple[tf.Tensor]] = None
+    encoder_hidden_states: Optional[Tuple[tf.Tensor]] = None
+    decoder_attentions: Optional[Tuple[Tuple[tf.Tensor]]] = None
+    cross_attentions: Optional[Tuple[Tuple[tf.Tensor]]] = None
+    decoder_hidden_states: Optional[Tuple[Tuple[tf.Tensor]]] = None
+
+
+@dataclass
+class TFBeamSearchDecoderOnlyOutput(ModelOutput):
+    """
+    Base class for outputs of decoder-only generation models using beam search.
+
+    Args:
+        sequences (`tf.Tensor` of shape `(batch_size*num_return_sequences, sequence_length)`):
+            The generated sequences. The second dimension (sequence_length) is either equal to `max_length` or shorter
+            if all batches finished early due to the `eos_token_id`.
+        sequences_scores (`tf.Tensor` of shape `(batch_size*num_return_sequences)`, *optional*, returned when `output_scores=True` is passed or when `config.output_scores=True`):
+            Final beam scores of the generated `sequences`.
+        scores (`tuple(tf.Tensor)` *optional*, returned when `output_scores=True` is passed or when `config.output_scores=True`):
+            Processed beam scores for each vocabulary token at each generation step. Beam scores consisting of log
+            softmax scores for each vocabulary token and sum of log softmax of previously generated tokens in this
+            beam. Tuple of `tf.Tensor` with up to `max_new_tokens` elements (one element for each generated token),
+            with each tensor of shape `(batch_size*num_beams*num_return_sequences, config.vocab_size)`.
+        beam_indices (`tf.Tensor`, *optional*, returned when `output_scores=True` is passed or when `config.output_scores=True`):
+            Beam indices of generated token id at each generation step. `tf.Tensor` of shape
+            `(batch_size*num_return_sequences, sequence_length)`.
+        attentions (`tuple(tuple(tf.Tensor))`, *optional*, returned when `output_attentions=True` is passed or `config.output_attentions=True`):
+            Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of
+            `tf.Tensor` of shape `(batch_size*num_beams, num_heads, generated_length, sequence_length)`.
+        hidden_states (`tuple(tuple(tf.Tensor))`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+            Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of
+            `tf.Tensor` of shape `(batch_size*num_beams*num_return_sequences, generated_length, hidden_size)`.
+    """
+
+    sequences: tf.Tensor = None
+    sequences_scores: Optional[tf.Tensor] = None
+    scores: Optional[Tuple[tf.Tensor]] = None
+    beam_indices: Optional[tf.Tensor] = None
+    attentions: Optional[Tuple[Tuple[tf.Tensor]]] = None
+    hidden_states: Optional[Tuple[Tuple[tf.Tensor]]] = None
+
+
+@dataclass
+class TFBeamSearchEncoderDecoderOutput(ModelOutput):
+    """
+    Base class for outputs of encoder-decoder generation models using beam search. Hidden states and attention weights
+    of the decoder (respectively the encoder) can be accessed via the encoder_attentions and the encoder_hidden_states
+    attributes (respectively the decoder_attentions and the decoder_hidden_states attributes)
+
+    Args:
+        sequences (`tf.Tensor` of shape `(batch_size*num_return_sequences, sequence_length)`):
+            The generated sequences. The second dimension (sequence_length) is either equal to `max_length` or shorter
+            if all batches finished early due to the `eos_token_id`.
+        sequences_scores (`tf.Tensor` of shape `(batch_size*num_return_sequences)`, *optional*, returned when `output_scores=True` is passed or when `config.output_scores=True`):
+            Final beam scores of the generated `sequences`.
+        scores (`tuple(tf.Tensor)` *optional*, returned when `output_scores=True` is passed or when `config.output_scores=True`):
+            Processed beam scores for each vocabulary token at each generation step. Beam scores consisting of log
+            softmax scores for each vocabulary token and sum of log softmax of previously generated tokens in this
+            beam. `Tuple of `tf.Tensor` with up to `max_new_tokens` elements (one element for each generated token),
+            with each tensor of shape `(batch_size*num_beams, config.vocab_size)`.
+        beam_indices (`tf.Tensor`, *optional*, returned when `output_scores=True` is passed or when `config.output_scores=True`):
+            Beam indices of generated token id at each generation step. `tf.Tensor` of shape
+            `(batch_size*num_return_sequences, sequence_length)`.
+        encoder_attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or `config.output_attentions=True`):
+            Tuple of `tf.Tensor` (one for each layer of the decoder) of shape `(batch_size, num_heads, sequence_length,
+            sequence_length)`.
+        encoder_hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+            Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape
+            `(batch_size*num_beams*num_return_sequences, sequence_length, hidden_size)`.
+        decoder_attentions (`tuple(tuple(tf.Tensor))`, *optional*, returned when `output_attentions=True` is passed or `config.output_attentions=True`):
+            Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of
+            `tf.Tensor` of shape `(batch_size*num_beams*num_return_sequences, num_heads, generated_length,
+            sequence_length)`.
+        cross_attentions (`tuple(tuple(tf.Tensor))`, *optional*, returned when `output_attentions=True` is passed or `config.output_attentions=True`):
+            Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of
+            `tf.Tensor` of shape `(batch_size, num_heads, generated_length, sequence_length)`.
+        decoder_hidden_states (`tuple(tuple(tf.Tensor))`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+            Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of
+            `tf.Tensor` of shape `(batch_size*num_beams*num_return_sequences, generated_length, hidden_size)`.
+    """
+
+    sequences: tf.Tensor = None
+    sequences_scores: Optional[tf.Tensor] = None
+    scores: Optional[Tuple[tf.Tensor]] = None
+    beam_indices: Optional[tf.Tensor] = None
+    encoder_attentions: Optional[Tuple[tf.Tensor]] = None
+    encoder_hidden_states: Optional[Tuple[tf.Tensor]] = None
+    decoder_attentions: Optional[Tuple[Tuple[tf.Tensor]]] = None
+    cross_attentions: Optional[Tuple[Tuple[tf.Tensor]]] = None
+    decoder_hidden_states: Optional[Tuple[Tuple[tf.Tensor]]] = None
+
+
+@dataclass
+class TFBeamSampleDecoderOnlyOutput(ModelOutput):
+    """
+    Base class for outputs of decoder-only generation models using beam sample.
+
+    Args:
+        sequences (`tf.Tensor` of shape `(batch_size*num_return_sequences, sequence_length)`):
+            The generated sequences. The second dimension (sequence_length) is either equal to `max_length` or shorter
+            if all batches finished early due to the `eos_token_id`.
+        sequences_scores (`tf.Tensor` of shape `(batch_size * num_return_sequence)`, *optional*, returned when `output_scores=True` is passed or when `config.output_scores=True`):
+            Final beam scores of the generated `sequences`.
+        scores (`tuple(tf.Tensor)` *optional*, returned when `output_scores=True` is passed or when `config.output_scores=True`):
+            Processed beam scores for each vocabulary token at each generation step. Beam scores consisting of log
+            softmax scores for each vocabulary token and sum of log softmax of previously generated tokens in this
+            beam. Tuple of `tf.Tensor` with up to `max_new_tokens` elements (one element for each generated token),
+            with each tensor of shape `(batch_size*num_beams*num_return_sequences, config.vocab_size)`.
+        beam_indices (`tf.Tensor`, *optional*, returned when `output_scores=True` is passed or when `config.output_scores=True`):
+            Beam indices of generated token id at each generation step. `tf.Tensor` of shape
+            `(batch_size*num_return_sequences, sequence_length)`.
+        attentions (`tuple(tuple(tf.Tensor))`, *optional*, returned when `output_attentions=True` is passed or `config.output_attentions=True`):
+            Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of
+            `tf.Tensor` of shape `(batch_size*num_beams, num_heads, generated_length, sequence_length)`.
+        hidden_states (`tuple(tuple(tf.Tensor))`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+            Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of
+            `tf.Tensor` of shape `(batch_size*num_beams, generated_length, hidden_size)`.
+    """
+
+    sequences: tf.Tensor = None
+    sequences_scores: Optional[tf.Tensor] = None
+    scores: Optional[Tuple[tf.Tensor]] = None
+    beam_indices: Optional[tf.Tensor] = None
+    attentions: Optional[Tuple[Tuple[tf.Tensor]]] = None
+    hidden_states: Optional[Tuple[Tuple[tf.Tensor]]] = None
+
+
+@dataclass
+class TFBeamSampleEncoderDecoderOutput(ModelOutput):
+    """
+    Base class for outputs of encoder-decoder generation models using beam sampling. Hidden states and attention
+    weights of the decoder (respectively the encoder) can be accessed via the encoder_attentions and the
+    encoder_hidden_states attributes (respectively the decoder_attentions and the decoder_hidden_states attributes)
+
+    Args:
+        sequences (`tf.Tensor` of shape `(batch_size*num_beams, sequence_length)`):
+            The generated sequences. The second dimension (sequence_length) is either equal to `max_length` or shorter
+            if all batches finished early due to the `eos_token_id`.
+        sequences_scores (`tf.Tensor` of shape `(batch_size * num_return_sequence)`, *optional*, returned when `output_scores=True` is passed or when `config.output_scores=True`):
+            Final beam scores of the generated `sequences`.
+        scores (`tuple(tf.Tensor)` *optional*, returned when `output_scores=True` is passed or when `config.output_scores=True`):
+            Processed beam scores for each vocabulary token at each generation step. Beam scores consisting of log
+            softmax scores for each vocabulary token and sum of log softmax of previously generated tokens in this
+            beam. Tuple of `tf.Tensor` with up to `max_new_tokens` elements (one element for each generated token),
+            with each tensor of shape `(batch_size*num_beams, config.vocab_size)`.
+        beam_indices (`tf.Tensor`, *optional*, returned when `output_scores=True` is passed or when `config.output_scores=True`):
+            Beam indices of generated token id at each generation step. `tf.Tensor` of shape
+            `(batch_size*num_return_sequences, sequence_length)`.
+        encoder_attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or `config.output_attentions=True`):
+            Tuple of `tf.Tensor` (one for each layer of the decoder) of shape `(batch_size, num_heads, sequence_length,
+            sequence_length)`.
+        encoder_hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+            Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape
+            `(batch_size*num_beams, sequence_length, hidden_size)`.
+        decoder_attentions (`tuple(tuple(tf.Tensor))`, *optional*, returned when `output_attentions=True` is passed or `config.output_attentions=True`):
+            Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of
+            `tf.Tensor` of shape `(batch_size*num_beams, num_heads, generated_length, sequence_length)`.
+        cross_attentions (`tuple(tuple(tf.Tensor))`, *optional*, returned when `output_attentions=True` is passed or `config.output_attentions=True`):
+            Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of
+            `tf.Tensor` of shape `(batch_size, num_heads, generated_length, sequence_length)`.
+        decoder_hidden_states (`tuple(tuple(tf.Tensor))`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+            Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of
+            `tf.Tensor` of shape `(batch_size*num_beams, generated_length, hidden_size)`.
+    """
+
+    sequences: tf.Tensor = None
+    sequences_scores: Optional[tf.Tensor] = None
+    scores: Optional[Tuple[tf.Tensor]] = None
+    beam_indices: Optional[tf.Tensor] = None
+    encoder_attentions: Optional[Tuple[tf.Tensor]] = None
+    encoder_hidden_states: Optional[Tuple[tf.Tensor]] = None
+    decoder_attentions: Optional[Tuple[Tuple[tf.Tensor]]] = None
+    cross_attentions: Optional[Tuple[Tuple[tf.Tensor]]] = None
+    decoder_hidden_states: Optional[Tuple[Tuple[tf.Tensor]]] = None
+
+
+@dataclass
+class TFContrastiveSearchDecoderOnlyOutput(ModelOutput):
+    """
+    Base class for outputs of decoder-only generation models using contrastive search.
+
+    Args:
+        sequences (`tf.Tensor` of shape `(batch_size, sequence_length)`):
+            The generated sequences. The second dimension (sequence_length) is either equal to `max_length` or shorter
+            if all batches finished early due to the `eos_token_id`.
+        scores (`tuple(tf.Tensor)` *optional*, returned when `output_scores=True` is passed or when `config.output_scores=True`):
+            Processed prediction scores of the language modeling head (scores for each vocabulary token before SoftMax)
+            at each generation step. Tuple of `tf.Tensor` with up to `max_new_tokens` elements (one element for each
+            generated token), with each tensor of shape `(batch_size, config.vocab_size)`.
+        attentions (`tuple(tuple(tf.Tensor))`, *optional*, returned when `output_attentions=True` is passed or `config.output_attentions=True`):
+            Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of
+            `tf.Tensor` of shape `(batch_size, num_heads, generated_length, sequence_length)`.
+        hidden_states (`tuple(tuple(tf.Tensor))`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+            Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of
+            `tf.Tensor` of shape `(batch_size, generated_length, hidden_size)`.
+    """
+
+    sequences: tf.Tensor = None
+    scores: Optional[Tuple[tf.Tensor]] = None
+    attentions: Optional[Tuple[Tuple[tf.Tensor]]] = None
+    hidden_states: Optional[Tuple[Tuple[tf.Tensor]]] = None
+
+
+@dataclass
+class TFContrastiveSearchEncoderDecoderOutput(ModelOutput):
+    """
+    Base class for outputs of encoder-decoder generation models using contrastive search. Hidden states and attention
+    weights of the decoder (respectively the encoder) can be accessed via the encoder_attentions and the
+    encoder_hidden_states attributes (respectively the decoder_attentions and the decoder_hidden_states attributes)
+
+    Args:
+        sequences (`tf.Tensor` of shape `(batch_size, sequence_length)`):
+            The generated sequences. The second dimension (sequence_length) is either equal to `max_length` or shorter
+            if all batches finished early due to the `eos_token_id`.
+        scores (`tuple(tf.Tensor)` *optional*, returned when `output_scores=True` is passed or when `config.output_scores=True`):
+            Processed prediction scores of the language modeling head (scores for each vocabulary token before SoftMax)
+            at each generation step. Tuple of `tf.Tensor` with up to `max_new_tokens` elements (one element for each
+            generated token), with each tensor of shape `(batch_size, config.vocab_size)`.
+        encoder_attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or `config.output_attentions=True`):
+            Tuple of `tf.Tensor` (one for each layer of the decoder) of shape `(batch_size, num_heads, sequence_length,
+            sequence_length)`.
+        encoder_hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+            Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape
+            `(batch_size, sequence_length, hidden_size)`.
+        decoder_attentions (`tuple(tuple(tf.Tensor))`, *optional*, returned when `output_attentions=True` is passed or `config.output_attentions=True`):
+            Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of
+            `tf.Tensor` of shape `(batch_size, num_heads, generated_length, sequence_length)`.
+        cross_attentions (`tuple(tuple(tf.Tensor))`, *optional*, returned when `output_attentions=True` is passed or `config.output_attentions=True`):
+            Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of
+            `tf.Tensor` of shape `(batch_size, num_heads, generated_length, sequence_length)`.
+        decoder_hidden_states (`tuple(tuple(tf.Tensor))`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+            Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of
+            `tf.Tensor` of shape `(batch_size, generated_length, hidden_size)`.
+    """
+
+    sequences: tf.Tensor = None
+    scores: Optional[Tuple[tf.Tensor]] = None
+    encoder_attentions: Optional[Tuple[tf.Tensor]] = None
+    encoder_hidden_states: Optional[Tuple[tf.Tensor]] = None
+    decoder_attentions: Optional[Tuple[Tuple[tf.Tensor]]] = None
+    cross_attentions: Optional[Tuple[Tuple[tf.Tensor]]] = None
+    decoder_hidden_states: Optional[Tuple[Tuple[tf.Tensor]]] = None
+
+
+TFGreedySearchOutput = Union[TFGreedySearchEncoderDecoderOutput, TFGreedySearchDecoderOnlyOutput]
+TFSampleOutput = Union[TFSampleEncoderDecoderOutput, TFSampleDecoderOnlyOutput]
+TFBeamSearchOutput = Union[TFBeamSearchEncoderDecoderOutput, TFBeamSearchDecoderOnlyOutput]
+TFBeamSampleOutput = Union[TFBeamSampleEncoderDecoderOutput, TFBeamSampleDecoderOnlyOutput]
+TFContrastiveSearchOutput = Union[TFContrastiveSearchEncoderDecoderOutput, TFContrastiveSearchDecoderOnlyOutput]
+TFGenerateOutput = Union[
+    TFGreedySearchOutput, TFSampleOutput, TFBeamSearchOutput, TFBeamSampleOutput, TFContrastiveSearchOutput
+]
+
+
+class TFGenerationMixin:
+    """
+    A class containing all of the functions supporting generation, to be used as a mixin in [`TFPreTrainedModel`].
+
+    The class exposes [`~generation.TFGenerationMixin.generate`], which can be used for:
+        - *greedy decoding* by calling [`~generation.TFGenerationMixin.greedy_search`] if `num_beams=1` and
+          `do_sample=False`
+        - *contrastive search* by calling [`~generation.TFGenerationMixin.contrastive_search`] if `penalty_alpha>0` and
+          `top_k>1`
+        - *multinomial sampling* by calling [`~generation.TFGenerationMixin.sample`] if `num_beams=1` and
+          `do_sample=True`
+        - *beam-search decoding* by calling [`~generation.TFGenerationMixin.beam_search`] if `num_beams>1`
+
+    You do not need to call any of the above methods directly. Pass custom parameter values to 'generate' instead. To
+    learn more about decoding strategies refer to the [text generation strategies guide](../generation_strategies).
+    """
+
+    _seed_generator = None
+
+    @property
+    def seed_generator(self):
+        warnings.warn("`seed_generator` is deprecated and will be removed in a future version.", UserWarning)
+        if self._seed_generator is None:
+            self._seed_generator = tf.random.Generator.from_non_deterministic_state()
+        return self._seed_generator
+
+    supports_xla_generation = True
+
+    def prepare_inputs_for_generation(self, *args, **kwargs):
+        raise NotImplementedError(
+            "A model class needs to define a `prepare_inputs_for_generation` method in order to use `generate`."
+        )
+
+    def compute_transition_scores(
+        self,
+        sequences: tf.Tensor,
+        scores: Tuple[tf.Tensor],
+        beam_indices: Optional[tf.Tensor] = None,
+        normalize_logits: bool = False,
+    ) -> tf.Tensor:
+        """
+        Computes the transition scores of sequences given the generation scores (and beam indices, if beam search was
+        used). This is a convenient method to quicky obtain the scores of the selected tokens at generation time.
+
+        Parameters:
+            sequences (`tf.Tensor`):
+                The generated sequences. The second dimension (sequence_length) is either equal to `max_length` or
+                shorter if all batches finished early due to the `eos_token_id`.
+            scores (`tuple(tf.Tensor)`):
+                Transition scores for each vocabulary token at each generation step. Beam transition scores consisting
+                of log probabilities of tokens conditioned on log softmax of previously generated tokens Tuple of
+                `tf.Tensor` with up to `max_new_tokens` elements (one element for each generated token), with each
+                tensor of shape `(batch_size*num_beams, config.vocab_size)`.
+            beam_indices (`tf.Tensor`, *optional*):
+                Beam indices of generated token id at each generation step. `tf.Tensor` of shape
+                `(batch_size*num_return_sequences, sequence_length)`. Only required if a `num_beams>1` at
+                generate-time.
+            normalize_logits (`bool`, *optional*, defaults to `False`):
+                Whether to normalize the logits (which, for legacy reasons, may be unnormalized).
+
+        Return:
+            `tf.Tensor`: A `tf.Tensor` of shape `(batch_size*num_return_sequences, sequence_length)` containing
+                the transition scores (logits)
+
+        Examples:
+
+        ```python
+        >>> from transformers import GPT2Tokenizer, TFAutoModelForCausalLM
+        >>> import numpy as np
+
+        >>> tokenizer = GPT2Tokenizer.from_pretrained("openai-community/gpt2")
+        >>> model = TFAutoModelForCausalLM.from_pretrained("openai-community/gpt2")
+        >>> tokenizer.pad_token_id = tokenizer.eos_token_id
+        >>> inputs = tokenizer(["Today is"], return_tensors="tf")
+
+        >>> # Example 1: Print the scores for each token generated with Greedy Search
+        >>> outputs = model.generate(**inputs, max_new_tokens=5, return_dict_in_generate=True, output_scores=True)
+        >>> transition_scores = model.compute_transition_scores(
+        ...     outputs.sequences, outputs.scores, normalize_logits=True
+        ... )
+        >>> # input_length is the length of the input prompt for decoder-only models, like the GPT family, and 1 for
+        >>> # encoder-decoder models, like BART or T5.
+        >>> input_length = 1 if model.config.is_encoder_decoder else inputs.input_ids.shape[1]
+        >>> generated_tokens = outputs.sequences[:, input_length:]
+        >>> for tok, score in zip(generated_tokens[0], transition_scores[0]):
+        ...     # | token | token string | logits | probability
+        ...     print(f"| {tok:5d} | {tokenizer.decode(tok):8s} | {score.numpy():.3f} | {np.exp(score.numpy()):.2%}")
+        |   262 |  the     | -1.414 | 24.33%
+        |  1110 |  day     | -2.609 | 7.36%
+        |   618 |  when    | -2.010 | 13.40%
+        |   356 |  we      | -1.859 | 15.58%
+        |   460 |  can     | -2.508 | 8.14%
+
+        >>> # Example 2: Reconstruct the sequence scores from Beam Search
+        >>> outputs = model.generate(
+        ...     **inputs,
+        ...     max_new_tokens=5,
+        ...     num_beams=4,
+        ...     num_return_sequences=4,
+        ...     return_dict_in_generate=True,
+        ...     output_scores=True,
+        ... )
+        >>> transition_scores = model.compute_transition_scores(
+        ...     outputs.sequences, outputs.scores, outputs.beam_indices, normalize_logits=False
+        ... )
+        >>> # If you sum the generated tokens' scores and apply the length penalty, you'll get the sequence scores.
+        >>> # Tip: recomputing the scores is only guaranteed to match with `normalize_logits=False`. Depending on the
+        >>> # use case, you might want to recompute it with `normalize_logits=True`.
+        >>> output_length = np.sum(transition_scores.numpy() < 0, axis=1)
+        >>> length_penalty = model.generation_config.length_penalty
+        >>> reconstructed_scores = np.sum(transition_scores, axis=1) / (output_length**length_penalty)
+        >>> print(np.allclose(outputs.sequences_scores, reconstructed_scores))
+        True
+        ```"""
+        # 1. In absence of `beam_indices`, we can assume that we come from e.g. greedy search, which is equivalent
+        # to a beam search approach were the first (and only) beam is always selected
+        if beam_indices is None:
+            beam_indices = tf.tile(tf.expand_dims(tf.range(scores[0].shape[0]), axis=1), [1, len(scores)])
+
+        # 2. reshape scores as [batch_size, vocab_size, # generation steps] with # generation steps being
+        # seq_len - input_length
+        scores = tf.transpose(tf.reshape(tf.stack(scores), (len(scores), -1)), (1, 0))
+        scores = tf.reshape(scores, (-1, self.config.vocab_size, scores.shape[-1]))
+
+        # 3. Optionally normalize the logits (across the vocab dimension)
+        if normalize_logits:
+            scores = tf.nn.log_softmax(scores, axis=1)
+
+        # 4. cut beam_indices to longest beam length
+        beam_indices_mask = beam_indices < 0
+        max_beam_length = tf.math.reduce_max(
+            tf.math.reduce_sum((1 - tf.cast(beam_indices_mask, dtype=tf.int32)), axis=-1)
+        )
+        beam_indices = beam_indices[:, -max_beam_length:]
+        beam_indices_mask = beam_indices_mask[:, -max_beam_length:]
+
+        # 5. Set indices of beams that finished early to 0; such indices will be masked correctly afterwards
+        beam_indices = tf.where(beam_indices_mask, 0, beam_indices)
+
+        # 6. Define which indices contributed to scores
+        cut_idx = sequences.shape[-1] - max_beam_length
+        token_indices = sequences[:, cut_idx:]
+        gen_step_idx = tf.broadcast_to(tf.range(scores.shape[-1]), token_indices.shape)
+        indices = tf.stack([beam_indices, token_indices, gen_step_idx], axis=-1)
+
+        # 7. Compute scores
+        transition_scores = tf.gather_nd(scores, indices)
+
+        # 8. Mask out transition_scores of beams that stopped early
+        transition_scores = tf.where(beam_indices_mask, 0, transition_scores)
+
+        return transition_scores
+
+    def _validate_model_class(self):
+        """
+        Confirms that the model class is compatible with generation. If not, raises an exception that points to the
+        right class to use.
+        """
+        if not self.can_generate():
+            generate_compatible_mappings = [
+                TF_MODEL_FOR_CAUSAL_LM_MAPPING,
+                TF_MODEL_FOR_VISION_2_SEQ_MAPPING,
+                TF_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING,
+                TF_MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING,
+            ]
+            generate_compatible_classes = set()
+            for model_mapping in generate_compatible_mappings:
+                supported_models = model_mapping.get(type(self.config), default=None)
+                if supported_models is not None:
+                    generate_compatible_classes.add(supported_models.__name__)
+            exception_message = (
+                f"The current model class ({self.__class__.__name__}) is not compatible with `.generate()`, as "
+                "it doesn't have a language model head."
+            )
+            if generate_compatible_classes:
+                exception_message += f" Please use one of the following classes instead: {generate_compatible_classes}"
+            raise TypeError(exception_message)
+
+    def _validate_model_kwargs(self, model_kwargs: Dict[str, Any]):
+        """Validates model kwargs for generation. Generate argument typos will also be caught here."""
+        # Excludes arguments that are handled before calling any model function
+        if self.config.is_encoder_decoder:
+            for key in ["decoder_input_ids"]:
+                model_kwargs.pop(key, None)
+
+        unused_model_args = []
+        model_args = set(inspect.signature(self.prepare_inputs_for_generation).parameters)
+        # `kwargs`/`model_kwargs` is often used to handle optional forward pass inputs like `attention_mask`. If
+        # `prepare_inputs_for_generation` doesn't accept them, then a stricter check can be made ;)
+        if "kwargs" in model_args or "model_kwargs" in model_args:
+            model_args |= set(inspect.signature(self.call).parameters)
+        for key, value in model_kwargs.items():
+            if value is not None and key not in model_args:
+                unused_model_args.append(key)
+
+        if unused_model_args:
+            raise ValueError(
+                f"The following `model_kwargs` are not used by the model: {unused_model_args} (note: typos in the"
+                " generate arguments will also show up in this list)"
+            )
+
+    def generate(
+        self,
+        inputs: Optional[tf.Tensor] = None,
+        generation_config: Optional[GenerationConfig] = None,
+        logits_processor: Optional[TFLogitsProcessorList] = None,
+        seed=None,
+        **kwargs,
+    ) -> Union[TFGenerateOutput, tf.Tensor]:
+        r"""
+        Generates sequences of token ids for models with a language modeling head.
+
+        <Tip warning={true}>
+
+        Most generation-controlling parameters are set in `generation_config` which, if not passed, will be set to the
+        model's default generation configuration. You can override any `generation_config` by passing the corresponding
+        parameters to generate, e.g. `.generate(inputs, num_beams=4, do_sample=True)`.
+
+        For an overview of generation strategies and code examples, check out the [following
+        guide](../generation_strategies).
+
+        </Tip>
+
+        Parameters:
+            inputs (`tf.Tensor` of varying shape depending on the modality, *optional*):
+                The sequence used as a prompt for the generation or as model inputs to the encoder. If `None` the
+                method initializes it with `bos_token_id` and a batch size of 1. For decoder-only models `inputs`
+                should of in the format of `input_ids`. For encoder-decoder models *inputs* can represent any of
+                `input_ids`, `input_values`, `input_features`, or `pixel_values`.
+            generation_config (`~generation.GenerationConfig`, *optional*):
+                The generation configuration to be used as base parametrization for the generation call. `**kwargs`
+                passed to generate matching the attributes of `generation_config` will override them. If
+                `generation_config` is not provided, the default will be used, which had the following loading
+                priority: 1) from the `generation_config.json` model file, if it exists; 2) from the model
+                configuration. Please note that unspecified parameters will inherit [`~generation.GenerationConfig`]'s
+                default values, whose documentation should be checked to parameterize generation.
+            logits_processor (`LogitsProcessorList`, *optional*):
+                Custom logits processors that complement the default logits processors built from arguments and
+                generation config. If a logit processor is passed that is already created with the arguments or a
+                generation config an error is thrown. This feature is intended for advanced users.
+            seed (`List[int]`, *optional*):
+                Random seed to control sampling, containing two integers, used when `do_sample` is `True`. See the
+                `seed` argument from stateless functions in `tf.random`.
+            kwargs (`Dict[str, Any]`, *optional*):
+                Ad hoc parametrization of `generate_config` and/or additional model-specific kwargs that will be
+                forwarded to the `forward` function of the model. If the model is an encoder-decoder model, encoder
+                specific kwargs should not be prefixed and decoder specific kwargs should be prefixed with *decoder_*.
+
+        Return:
+            [`~utils.ModelOutput`] or `tf.Tensor`: A [`~utils.ModelOutput`] (if `return_dict_in_generate=True` or when
+            `config.return_dict_in_generate=True`) or a `tf.Tensor`.
+
+                If the model is *not* an encoder-decoder model (`model.config.is_encoder_decoder=False`), the possible
+                [`~utils.ModelOutput`] types are:
+
+                    - [`~generation.TFGreedySearchDecoderOnlyOutput`],
+                    - [`~generation.TFSampleDecoderOnlyOutput`],
+                    - [`~generation.TFBeamSearchDecoderOnlyOutput`],
+                    - [`~generation.TFBeamSampleDecoderOnlyOutput`]
+
+                If the model is an encoder-decoder model (`model.config.is_encoder_decoder=True`), the possible
+                [`~utils.ModelOutput`] types are:
+
+                    - [`~generation.TFGreedySearchEncoderDecoderOutput`],
+                    - [`~generation.TFSampleEncoderDecoderOutput`],
+                    - [`~generation.TFBeamSearchEncoderDecoderOutput`],
+                    - [`~generation.TFBeamSampleEncoderDecoderOutput`]
+
+        """
+
+        # 1. Handle `generation_config` and kwargs that might update it, and validate the `.generate()` call
+        self._validate_model_class()
+
+        # priority: `generation_config` argument > `model.generation_config` (the default generation config)
+        if generation_config is None:
+            # legacy: users may modify the model configuration to control generation. To trigger this legacy behavior,
+            # two conditions must be met
+            # 1) the generation config must have been created from the model config (`_from_model_config` field);
+            # 2) the generation config must have seen no modification since its creation (the hash is the same).
+            if self.generation_config._from_model_config and self.generation_config._original_object_hash == hash(
+                self.generation_config
+            ):
+                new_generation_config = GenerationConfig.from_model_config(self.config)
+                if new_generation_config != self.generation_config:
+                    warnings.warn(
+                        "You have modified the pretrained model configuration to control generation. This is a"
+                        " deprecated strategy to control generation and will be removed soon, in a future version."
+                        " Please use and modify the model generation configuration (see"
+                        " https://huggingface.co/docs/transformers/generation_strategies#default-text-generation-configuration )"
+                    )
+                    self.generation_config = new_generation_config
+            generation_config = self.generation_config
+
+        generation_config = copy.deepcopy(generation_config)
+        model_kwargs = generation_config.update(**kwargs)  # All unused kwargs must be model kwargs
+        self._validate_model_kwargs(model_kwargs.copy())
+
+        # 2. Cast input dtypes to tf.int32 unless they're floats (which happens for some image models)
+        if inputs is not None:
+            if isinstance(inputs, tf.Tensor) and inputs.dtype.is_floating:
+                pass
+            elif isinstance(inputs, np.ndarray) and np.issubdtype(inputs.dtype, np.floating):
+                pass
+            else:
+                inputs = tf.cast(inputs, tf.int32)
+        if model_kwargs.get("attention_mask") is not None:
+            model_kwargs["attention_mask"] = tf.cast(model_kwargs["attention_mask"], tf.int32)
+        if "decoder_input_ids" in model_kwargs:
+            if (
+                isinstance(model_kwargs["decoder_input_ids"], tf.Tensor)
+                and model_kwargs["decoder_input_ids"].dtype.is_floating
+            ):
+                pass
+            elif isinstance(model_kwargs["decoder_input_ids"], np.ndarray) and np.issubdtype(
+                model_kwargs["decoder_input_ids"].dtype, np.floating
+            ):
+                pass
+            else:
+                model_kwargs["decoder_input_ids"] = tf.cast(model_kwargs["decoder_input_ids"], tf.int32)
+
+        # 3. Set generation parameters if not already defined
+        logits_processor = logits_processor if logits_processor is not None else TFLogitsProcessorList()
+
+        if generation_config.pad_token_id is None and generation_config.eos_token_id is not None:
+            if model_kwargs.get("attention_mask") is None:
+                logger.warning(
+                    "The attention mask and the pad token id were not set. As a consequence, you may observe "
+                    "unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results."
+                )
+            eos_token_id = generation_config.eos_token_id
+            if isinstance(eos_token_id, list):
+                eos_token_id = eos_token_id[0]
+            generation_config.pad_token_id = eos_token_id
+
+        use_xla = not tf.executing_eagerly()
+        if use_xla and not self.supports_xla_generation:
+            raise ValueError(
+                "The selected model does not support Graph mode nor XLA generation (e.g. from tf.function())"
+            )
+
+        # 4. Define model inputs
+        inputs_tensor, model_input_name, model_kwargs = self._prepare_model_inputs(
+            inputs, generation_config.bos_token_id, model_kwargs
+        )
+        # inputs_ids now has to be defined and cannot be None anymore
+        batch_size = shape_list(inputs_tensor)[0]
+
+        # 5. Prepare other model kwargs
+        model_kwargs["output_attentions"] = generation_config.output_attentions
+        model_kwargs["output_hidden_states"] = generation_config.output_hidden_states
+        model_kwargs["use_cache"] = generation_config.use_cache
+
+        accepts_attention_mask = "attention_mask" in set(inspect.signature(self.call).parameters.keys())
+        requires_attention_mask = "encoder_outputs" not in model_kwargs
+
+        if model_kwargs.get("attention_mask", None) is None and requires_attention_mask and accepts_attention_mask:
+            model_kwargs["attention_mask"] = self._prepare_attention_mask_for_generation(
+                inputs_tensor, generation_config.pad_token_id, generation_config.eos_token_id
+            )
+
+        # decoder-only models should use left-padding for generation
+        if not self.config.is_encoder_decoder:
+            if generation_config.pad_token_id is not None and tf.math.reduce_any(
+                inputs_tensor[:, -1] == generation_config.pad_token_id
+            ):
+                logger.warning(
+                    "A decoder-only architecture is being used, but right-padding was detected! For correct "
+                    "generation results, please set `padding_side='left'` when initializing the tokenizer."
+                )
+        if self.config.is_encoder_decoder and "encoder_outputs" not in model_kwargs:
+            # if model is encoder decoder encoder_outputs are created and added to `model_kwargs`
+            model_kwargs = self._prepare_encoder_decoder_kwargs_for_generation(
+                inputs_tensor, model_kwargs, model_input_name
+            )
+
+        # 6. Prepare model inputs which will be used for auto-regressive generation
+        if self.config.is_encoder_decoder:
+            input_ids, model_kwargs = self._prepare_decoder_input_ids_for_generation(
+                batch_size=batch_size,
+                model_input_name=model_input_name,
+                model_kwargs=model_kwargs,
+                decoder_start_token_id=generation_config.decoder_start_token_id,
+                bos_token_id=generation_config.bos_token_id,
+            )
+        else:
+            input_ids = inputs_tensor if model_input_name == "input_ids" else model_kwargs.pop("input_ids")
+
+        # 7. Prepare `max_length` depending on other stopping criteria.
+        input_ids_seq_length = shape_list(input_ids)[-1]
+        has_default_max_length = kwargs.get("max_length") is None and generation_config.max_length is not None
+        if has_default_max_length and generation_config.max_new_tokens is None and generation_config.max_length == 20:
+            # 20 is the default max_length of the generation config
+            warnings.warn(
+                f"Using the model-agnostic default `max_length` (={generation_config.max_length}) "
+                "to control the generation length.  recommend setting `max_new_tokens` to control the maximum length of the generation.",
+                UserWarning,
+            )
+        elif generation_config.max_new_tokens is not None:
+            if not has_default_max_length and generation_config.max_length is not None:
+                logger.warning(
+                    f"Both `max_new_tokens` (={generation_config.max_new_tokens}) and `max_length`(="
+                    f"{generation_config.max_length}) seem to have been set. `max_new_tokens` will take precedence. "
+                    "Please refer to the documentation for more information. "
+                    "(https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)"
+                )
+            generation_config.max_length = generation_config.max_new_tokens + input_ids_seq_length
+
+        # If the input length is a tensor (i.e. dynamic length), skip length checks
+        if not isinstance(input_ids_seq_length, tf.Tensor):
+            if (
+                generation_config.min_length is not None
+                and generation_config.min_length > generation_config.max_length
+            ):
+                raise ValueError(
+                    f"Unfeasable length constraints: the minimum length ({generation_config.min_length}) is larger"
+                    f" than the maximum length ({generation_config.max_length})"
+                )
+            if input_ids_seq_length >= generation_config.max_length:
+                input_ids_string = "decoder_input_ids" if self.config.is_encoder_decoder else "input_ids"
+                logger.warning(
+                    f"Input length of {input_ids_string} is {input_ids_seq_length}, but `max_length` is set to"
+                    f" {generation_config.max_length}. This can lead to unexpected behavior. You should consider"
+                    " increasing`max_new_tokens`."
+                )
+
+        # 8. determine generation mode
+        is_contrastive_search_gen_mode = (
+            generation_config.top_k is not None
+            and generation_config.top_k > 1
+            and generation_config.do_sample is False
+            and generation_config.penalty_alpha is not None
+            and generation_config.penalty_alpha > 0
+        )
+        is_greedy_gen_mode = (
+            not is_contrastive_search_gen_mode
+            and (generation_config.num_beams == 1)
+            and generation_config.do_sample is False
+        )
+        is_beam_gen_mode = (
+            not is_contrastive_search_gen_mode
+            and (generation_config.num_beams > 1)
+            and generation_config.do_sample is False
+        )
+        is_sample_gen_mode = (generation_config.num_beams == 1) and generation_config.do_sample is True
+        is_beam_sample_gen_mode = (generation_config.num_beams > 1) and generation_config.do_sample is True
+
+        # 9. prepare distribution pre_processing samplers
+        logits_processor = self._get_logits_processor(
+            generation_config=generation_config,
+            input_ids_seq_length=input_ids_seq_length,
+            logits_processor=logits_processor,
+        )
+
+        # 10. go into different generation modes
+        if is_greedy_gen_mode:
+            if generation_config.num_return_sequences > 1:
+                raise ValueError(
+                    f"num_return_sequences has to be 1, but is {generation_config.num_return_sequences} when doing"
+                    " greedy search."
+                )
+            # 11. run greedy search
+            return self.greedy_search(
+                input_ids,
+                max_length=generation_config.max_length,
+                pad_token_id=generation_config.pad_token_id,
+                eos_token_id=generation_config.eos_token_id,
+                logits_processor=logits_processor,
+                output_scores=generation_config.output_scores,
+                return_dict_in_generate=generation_config.return_dict_in_generate,
+                **model_kwargs,
+            )
+        elif is_contrastive_search_gen_mode:
+            if generation_config.num_return_sequences > 1:
+                raise ValueError(
+                    f"num_return_sequences has to be 1, but is {generation_config.num_return_sequences} when doing"
+                    " contrastive search."
+                )
+            # 11. run contrastive search
+            return self.contrastive_search(
+                input_ids,
+                top_k=generation_config.top_k,
+                penalty_alpha=generation_config.penalty_alpha,
+                logits_processor=logits_processor,
+                max_length=generation_config.max_length,
+                pad_token_id=generation_config.pad_token_id,
+                eos_token_id=generation_config.eos_token_id,
+                output_scores=generation_config.output_scores,
+                return_dict_in_generate=generation_config.return_dict_in_generate,
+                **model_kwargs,
+            )
+        elif is_sample_gen_mode:
+            # 11. prepare logits warper
+            logits_warper = self._get_logits_warper(generation_config=generation_config)
+
+            # 12. expand input_ids with `num_return_sequences` additional sequences per batch
+            input_ids, model_kwargs = self._expand_inputs_for_generation(
+                input_ids=input_ids,
+                expand_size=generation_config.num_return_sequences,
+                is_encoder_decoder=self.config.is_encoder_decoder,
+                **model_kwargs,
+            )
+
+            # 13. run sample
+            return self.sample(
+                input_ids,
+                logits_processor=logits_processor,
+                logits_warper=logits_warper,
+                max_length=generation_config.max_length,
+                pad_token_id=generation_config.pad_token_id,
+                eos_token_id=generation_config.eos_token_id,
+                seed=seed,
+                output_scores=generation_config.output_scores,
+                return_dict_in_generate=generation_config.return_dict_in_generate,
+                **model_kwargs,
+            )
+
+        elif is_beam_gen_mode:
+            if generation_config.num_beams < generation_config.num_return_sequences:
+                raise ValueError(
+                    "Beam search decoding cannot return more sequences than it has beams. Please set num_beams >="
+                    f" num_return_sequences, got {generation_config.num_beams} and"
+                    f" {generation_config.num_return_sequences} (respectivelly)"
+                )
+
+            # 11. broadcast inputs to the desired number of beams
+            input_ids, model_kwargs = self._expand_inputs_for_generation(
+                input_ids=input_ids,
+                expand_size=generation_config.num_beams,
+                is_encoder_decoder=self.config.is_encoder_decoder,
+                expand_in_new_axis=True,
+                **model_kwargs,
+            )
+
+            # 12. run beam search
+            return self.beam_search(
+                input_ids,
+                max_length=generation_config.max_length,
+                pad_token_id=generation_config.pad_token_id,
+                eos_token_id=generation_config.eos_token_id,
+                length_penalty=generation_config.length_penalty,
+                early_stopping=generation_config.early_stopping,
+                logits_processor=logits_processor,
+                output_scores=generation_config.output_scores,
+                return_dict_in_generate=generation_config.return_dict_in_generate,
+                num_return_sequences=generation_config.num_return_sequences,
+                **model_kwargs,
+            )
+
+        elif is_beam_sample_gen_mode:
+            if generation_config.num_beams < generation_config.num_return_sequences:
+                raise ValueError(
+                    "Beam search decoding cannot return more sequences than it has beams. Please set num_beams >="
+                    f" num_return_sequences, got {generation_config.num_beams} and"
+                    f" {generation_config.num_return_sequences} (respectivelly)"
+                )
+
+            # 11. prepare logits warper
+            logits_warper = self._get_logits_warper(generation_config=generation_config)
+
+            # 12. broadcast inputs to the desired number of beams
+            input_ids, model_kwargs = self._expand_inputs_for_generation(
+                input_ids=input_ids,
+                expand_size=generation_config.num_beams,
+                is_encoder_decoder=self.config.is_encoder_decoder,
+                expand_in_new_axis=True,
+                **model_kwargs,
+            )
+
+            # 13. run beam sample (beam search with sampling)
+            return self.beam_search(
+                input_ids,
+                do_sample=True,
+                max_length=generation_config.max_length,
+                pad_token_id=generation_config.pad_token_id,
+                eos_token_id=generation_config.eos_token_id,
+                length_penalty=generation_config.length_penalty,
+                early_stopping=generation_config.early_stopping,
+                logits_processor=logits_processor,
+                logits_warper=logits_warper,
+                output_scores=generation_config.output_scores,
+                return_dict_in_generate=generation_config.return_dict_in_generate,
+                num_return_sequences=generation_config.num_return_sequences,
+                **model_kwargs,
+            )
+
+    def _prepare_attention_mask_for_generation(
+        self,
+        inputs: tf.Tensor,
+        pad_token_id: Optional[int],
+        eos_token_id: Optional[int],
+    ) -> tf.Tensor:
+        is_input_ids = len(inputs.shape) == 2 and inputs.dtype in (tf.int32, tf.int64)
+        is_pad_token_in_inputs = (pad_token_id is not None) and tf.math.reduce_any(inputs == pad_token_id)
+        is_pad_token_not_equal_to_eos_token_id = (eos_token_id is None) or (pad_token_id != eos_token_id)
+
+        # Check if input is input_ids and padded -> only then is attention_mask defined
+        if is_input_ids and is_pad_token_in_inputs and is_pad_token_not_equal_to_eos_token_id:
+            return tf.cast(tf.math.not_equal(inputs, pad_token_id), dtype=tf.int32)
+        else:
+            return tf.ones(inputs.shape[:2], dtype=tf.int32)
+
+    def _prepare_encoder_decoder_kwargs_for_generation(
+        self, inputs_tensor: tf.Tensor, model_kwargs, model_input_name: Optional[str] = None
+    ) -> Dict[str, Any]:
+        # 1. get encoder and store encoder outputs
+        encoder = self.get_encoder()
+
+        # 2. prepare encoder args and encoder kwargs from model kwargs
+        irrelevant_prefix = ["decoder_", "cross_attn", "use_cache"]
+        encoder_kwargs = {
+            argument: value
+            for argument, value in model_kwargs.items()
+            if not any(argument.startswith(p) for p in irrelevant_prefix)
+        }
+        encoder_signature = set(inspect.signature(encoder.call).parameters)
+        encoder_accepts_wildcard = "kwargs" in encoder_signature or "model_kwargs" in encoder_signature
+        if not encoder_accepts_wildcard:
+            encoder_kwargs = {
+                argument: value for argument, value in encoder_kwargs.items() if argument in encoder_signature
+            }
+
+        # 3. vision models don't use `attention_mask`.
+        encoder_kwargs["return_dict"] = True
+        encoder_kwargs[model_input_name] = inputs_tensor
+        if model_input_name != self.main_input_name:  # in Keras, the first input must always be passed
+            encoder_kwargs[self.main_input_name] = None
+        encoder_outputs = encoder(**encoder_kwargs)
+        model_kwargs["encoder_outputs"] = encoder_outputs
+
+        return model_kwargs
+
+    def _prepare_decoder_input_ids_for_generation(
+        self,
+        batch_size: int,
+        model_input_name: str,
+        model_kwargs: Dict[str, tf.Tensor],
+        decoder_start_token_id: int = None,
+        bos_token_id: int = None,
+    ) -> Tuple[tf.Tensor, Dict[str, tf.Tensor]]:
+        """Prepares `decoder_input_ids` for generation with encoder-decoder models"""
+        # 1. Check whether the user has defined `decoder_input_ids` manually. To facilitate in terms of input naming,
+        # we also allow the user to pass it under `input_ids`, if the encoder does not use it as the main input.
+        if model_kwargs is not None and "decoder_input_ids" in model_kwargs:
+            decoder_input_ids = model_kwargs.pop("decoder_input_ids")
+        elif "input_ids" in model_kwargs and model_input_name != "input_ids":
+            decoder_input_ids = model_kwargs.pop("input_ids")
+        else:
+            decoder_input_ids = None
+
+        # 2. Encoder-decoder models expect the `decoder_input_ids` to start with a special token. Let's ensure that.
+        decoder_start_token_id = self._get_decoder_start_token_id(decoder_start_token_id, bos_token_id)
+        decoder_input_ids_start = tf.ones((batch_size, 1), dtype=tf.int32) * decoder_start_token_id
+
+        # no user input -> use decoder_start_token_id as decoder_input_ids
+        if decoder_input_ids is None:
+            decoder_input_ids = decoder_input_ids_start
+        # user input but doesn't start with decoder_start_token_id -> prepend decoder_start_token_id (and adjust
+        # decoder_attention_mask if provided)
+        elif tf.reduce_all(decoder_input_ids[:, 0] != decoder_start_token_id):
+            decoder_input_ids = tf.concat([decoder_input_ids_start, decoder_input_ids], axis=-1)
+            if "decoder_attention_mask" in model_kwargs:
+                decoder_attention_mask = model_kwargs["decoder_attention_mask"]
+                decoder_attention_mask = tf.concat(
+                    (tf.ones_like(decoder_attention_mask)[:, :1], decoder_attention_mask),
+                    axis=-1,
+                )
+                model_kwargs["decoder_attention_mask"] = decoder_attention_mask
+
+        return decoder_input_ids, model_kwargs
+
+    def _get_decoder_start_token_id(self, decoder_start_token_id: int = None, bos_token_id: int = None) -> int:
+        # retrieve decoder_start_token_id for encoder-decoder models
+        # fall back to bos_token_id if necessary
+        decoder_start_token_id = (
+            decoder_start_token_id
+            if decoder_start_token_id is not None
+            else self.generation_config.decoder_start_token_id
+        )
+        bos_token_id = bos_token_id if bos_token_id is not None else self.generation_config.bos_token_id
+
+        if decoder_start_token_id is not None:
+            return decoder_start_token_id
+        elif bos_token_id is not None:
+            return bos_token_id
+        raise ValueError(
+            "`decoder_start_token_id` or `bos_token_id` has to be defined for encoder-decoder generation."
+        )
+
+    @staticmethod
+    def _expand_inputs_for_generation(
+        expand_size: int = 1,
+        is_encoder_decoder: bool = False,
+        input_ids: Optional[tf.Tensor] = None,
+        expand_in_new_axis: bool = False,
+        **model_kwargs,
+    ) -> Tuple[tf.Tensor, Dict[str, Any]]:
+        """
+        Expands tensors from [batch_size, ...] to [batch_size * expand_size, ...] or [batch_size, expand_size, ...],
+        depending on `expand_in_new_axis`. Beam-based approaches expect this function to be used with
+        `expand_in_new_axis=True`
+        """
+
+        def _expand_tensor(tensor: tf.Tensor):
+            if expand_in_new_axis:
+                shape = shape_list(tensor)
+                return tf.broadcast_to(tensor[:, None], (shape[0], expand_size) + tuple(shape[1:]))
+            else:
+                return tf.repeat(tensor, expand_size, axis=0)
+
+        def _expand_dict_for_generation(dict_to_expand):
+            for key in dict_to_expand:
+                if dict_to_expand[key] is not None and isinstance(dict_to_expand[key], tf.Tensor):
+                    dict_to_expand[key] = _expand_tensor(dict_to_expand[key])
+            return dict_to_expand
+
+        if input_ids is not None:
+            input_ids = _expand_tensor(input_ids)
+
+        model_kwargs = _expand_dict_for_generation(model_kwargs)
+
+        if is_encoder_decoder:
+            if model_kwargs.get("encoder_outputs") is None:
+                raise ValueError("If `is_encoder_decoder` is True, make sure that `encoder_outputs` is defined.")
+            model_kwargs["encoder_outputs"] = _expand_dict_for_generation(model_kwargs["encoder_outputs"])
+
+        return input_ids, model_kwargs
+
+    def _prepare_model_inputs(
+        self,
+        inputs: Optional[tf.Tensor] = None,
+        bos_token_id: Optional[int] = None,
+        model_kwargs: Optional[Dict[str, tf.Tensor]] = None,
+    ) -> Tuple[tf.Tensor, Optional[str], Dict[str, tf.Tensor]]:
+        """
+        This function extracts the model-specific `inputs` for generation.
+        """
+        # 1. retrieve all kwargs that are non-None or non-model input related.
+        # some encoder-decoder models have different names for model and encoder
+        if (
+            self.config.is_encoder_decoder
+            and hasattr(self, "encoder")
+            and hasattr(self.encoder, "main_input_name")
+            and self.encoder.main_input_name != self.main_input_name
+        ):
+            input_name = self.encoder.main_input_name
+        else:
+            input_name = self.main_input_name
+
+        model_kwargs = {k: v for k, v in model_kwargs.items() if v is not None or k != input_name}
+
+        # 2. check whether model_input_name is passed as kwarg
+        # if yes and `inputs` is None use kwarg inputs
+        inputs_kwarg = model_kwargs.pop(input_name, None)
+        if inputs_kwarg is not None and inputs is not None:
+            raise ValueError(
+                f"`inputs`: {inputs}` were passed alongside {input_name} which is not allowed. "
+                f"Make sure to either pass {inputs} or {input_name}=..."
+            )
+        elif inputs_kwarg is not None:
+            inputs = inputs_kwarg
+
+        # 3. In the presence of `inputs_embeds` for text models:
+        # - decoder-only models should complain if the user attempts to pass `inputs_embeds`, but the model
+        # doesn't have its forwarding implemented. `inputs_embeds` is kept in `model_kwargs` and can coexist with
+        # input_ids (`inputs_embeds` will be used in the 1st generation step, as opposed to `input_ids`)
+        # - encoder-decoder models should complain if the user attempts to pass `inputs_embeds` and `input_ids`, and
+        # pull the former to inputs. It will be used in place of `input_ids` to get the encoder hidden states.
+        if input_name == "input_ids" and "inputs_embeds" in model_kwargs:
+            if not self.config.is_encoder_decoder:
+                has_inputs_embeds_forwarding = "inputs_embeds" in set(
+                    inspect.signature(self.prepare_inputs_for_generation).parameters.keys()
+                )
+                if not has_inputs_embeds_forwarding:
+                    raise ValueError(
+                        f"You passed `inputs_embeds` to `.generate()`, but the model class {self.__class__.__name__} "
+                        "doesn't have its forwarding implemented. See the GPT2 implementation for an example "
+                        "(https://github.com/huggingface/transformers/pull/21405), and feel free to open a PR with it!"
+                    )
+                # In this case, `input_ids` is moved to the `model_kwargs`, so a few automations (like the creation of
+                # the attention mask) can rely on the actual model input.
+                model_kwargs["input_ids"] = self._maybe_initialize_input_ids_for_generation(
+                    inputs, bos_token_id, model_kwargs=model_kwargs
+                )
+            else:
+                if inputs is not None:
+                    raise ValueError("You passed `inputs_embeds` and `input_ids` to `.generate()`. Please pick one.")
+            inputs, input_name = model_kwargs["inputs_embeds"], "inputs_embeds"
+
+        # 4. if `inputs` is still None, try to create `input_ids` from BOS token
+        inputs = self._maybe_initialize_input_ids_for_generation(inputs, bos_token_id, model_kwargs)
+
+        return inputs, input_name, model_kwargs
+
+    def _maybe_initialize_input_ids_for_generation(
+        self,
+        inputs: Optional[tf.Tensor] = None,
+        bos_token_id: Optional[int] = None,
+        model_kwargs: Optional[Dict[str, tf.Tensor]] = None,
+    ) -> tf.Tensor:
+        """Initializes input ids for generation, if necessary."""
+        if inputs is not None:
+            return inputs
+
+        encoder_outputs = model_kwargs.get("encoder_outputs")
+        if self.config.is_encoder_decoder and encoder_outputs is not None:
+            # make dummy input_ids with value -100, as a sanity check ensuring that they won't be used for encoding
+            shape = encoder_outputs.last_hidden_state.shape[:-1]
+            return tf.ones(shape, dtype=tf.int32) * -100
+
+        if bos_token_id is None:
+            raise ValueError("`bos_token_id` has to be defined when no `input_ids` are provided.")
+
+        # If there is some tensor in `model_kwargs`, we can infer the batch size from it. This is helpful with
+        # soft-prompting or in multimodal implementations built on top of decoder-only language models.
+        batch_size = 1
+        for value in model_kwargs.values():
+            if isinstance(value, tf.Tensor):
+                batch_size = value.shape[0]
+                break
+        return tf.ones((batch_size, 1), dtype=tf.int32) * bos_token_id
+
+    @staticmethod
+    def _extract_past_from_model_output(outputs: ModelOutput):
+        past_key_values = None
+        if "past_key_values" in outputs:
+            past_key_values = outputs.past_key_values
+        elif "mems" in outputs:
+            past_key_values = outputs.mems
+        elif "past_buckets_states" in outputs:
+            past_key_values = outputs.past_buckets_states
+        return past_key_values
+
+    def _update_model_kwargs_for_generation(
+        self, outputs: ModelOutput, model_kwargs: Dict[str, Any], is_encoder_decoder: bool = False
+    ) -> Dict[str, Any]:
+        # update past_key_values
+        model_kwargs["past_key_values"] = self._extract_past_from_model_output(outputs)
+
+        # update attention mask
+        if not is_encoder_decoder:
+            if "attention_mask" in model_kwargs:
+                attention_mask = model_kwargs["attention_mask"]
+                model_kwargs["attention_mask"] = tf.concat(
+                    [attention_mask, tf.ones((shape_list(attention_mask)[0], 1), dtype=tf.int32)], axis=-1
+                )
+
+        return model_kwargs
+
+    def _update_model_kwargs_for_xla_generation(
+        self,
+        model_outputs: ModelOutput,
+        model_kwargs: Dict[str, Any],
+        cur_len: int,
+        max_length: int,
+        batch_size: int,
+        is_encoder_decoder: bool = False,
+        batch_axis: int = 0,
+    ):
+        def _initialize_attention(model_kwargs, num_padding_values, is_encoder_decoder):
+            """initializes the appropriate attention mask -- encoder-decoder models use `decoder_attention_mask`"""
+            if is_encoder_decoder:
+                # One 1 for decoder_start_token_id, 0s for the currently-unfilled locations in the past_key_values tensor,
+                # 1s for the actual input_ids
+                decoder_attention_mask = tf.concat(
+                    [
+                        tf.ones((batch_size, 1), dtype=tf.int32),
+                        tf.zeros((batch_size, num_padding_values), dtype=tf.int32),
+                        tf.ones((batch_size, 1), dtype=tf.int32),
+                    ],
+                    axis=1,
+                )
+                mask = {"decoder_attention_mask": decoder_attention_mask}
+            else:
+                attention_mask = model_kwargs.pop("attention_mask")
+                # 0s for the currently-unfilled locations in the past_key_values tensor, 1s for the actual input_ids
+                attention_mask = tf.concat(
+                    [
+                        attention_mask,
+                        tf.zeros((batch_size, num_padding_values), dtype=attention_mask.dtype),
+                        tf.ones((batch_size, 1), dtype=attention_mask.dtype),
+                    ],
+                    axis=1,
+                )
+                mask = {"attention_mask": attention_mask}
+            return mask
+
+        def _update_attention(model_kwargs, new_past_index, is_encoder_decoder):
+            """updates the appropriate attention mask -- encoder-decoder models use `decoder_attention_mask`"""
+            update_start = tf.constant([0, 1], dtype=tf.int32) * new_past_index
+            if is_encoder_decoder:
+                decoder_attention_mask = model_kwargs.pop("decoder_attention_mask")
+                decoder_attention_mask_update_slice = tf.ones((batch_size, 1), dtype=decoder_attention_mask.dtype)
+                decoder_attention_mask = dynamic_update_slice(
+                    decoder_attention_mask, decoder_attention_mask_update_slice, update_start
+                )
+                mask = {"decoder_attention_mask": decoder_attention_mask}
+            else:
+                attention_mask = model_kwargs.pop("attention_mask")
+                attention_mask_update_slice = tf.ones((batch_size, 1), dtype=attention_mask.dtype)
+                attention_mask = dynamic_update_slice(attention_mask, attention_mask_update_slice, update_start)
+                mask = {"attention_mask": attention_mask}
+            return mask
+
+        def _initialize_past(past_key_values, num_padding_values, batch_axis):
+            """initialize past_key_values with zeros -- the structure depends on `batch_axis`"""
+            if batch_axis == 0:
+                padding_values = tf.constant([[0, 0], [0, 0], [0, num_padding_values], [0, 0]], dtype=tf.int32)
+                new_past = ()
+                for past_layer in past_key_values:
+                    new_past_layer = list(past_layer)
+                    for i in range(len(new_past_layer[:2])):
+                        new_past_layer[i] = tf.pad(past_layer[i], padding_values)
+                    new_past += (tuple(new_past_layer),)
+            else:
+                padding_values = tf.scatter_nd(indices=[[3, 1]], updates=[num_padding_values], shape=(5, 2))
+                new_past = list(past_key_values)
+                for i in range(len(past_key_values)):
+                    new_past[i] = tf.pad(past_key_values[i], padding_values)
+            return new_past
+
+        def _update_past(past_key_values, new_past_index, batch_axis):
+            if batch_axis == 0:
+                slice_start_base = tf.constant([0, 0, 1, 0])
+                new_past = ()
+                for past_layer in past_key_values:
+                    new_past_layer = list(past_layer)
+                    for i in range(len(new_past_layer[:2])):
+                        update_slice = past_layer[i][:, :, -1:]
+                        # Write the last slice to the first open location in the padded past_key_values array
+                        # and then truncate the last slice off the array
+                        new_past_layer[i] = dynamic_update_slice(
+                            past_layer[i][:, :, :-1], update_slice, slice_start_base * new_past_index
+                        )
+                    new_past += (tuple(new_past_layer),)
+            else:
+                slice_start_base = tf.constant([0, 0, 0, 1, 0])
+                new_past = [None for _ in range(len(past_key_values))]
+                for i in range(len(past_key_values)):
+                    update_slice = past_key_values[i][:, :, :, -1:]
+                    # Write the last slice to the first open location in the padded past_key_values array
+                    # and then truncate the last slice off the array
+                    new_past[i] = dynamic_update_slice(
+                        past_key_values[i][:, :, :, :-1], update_slice, slice_start_base * new_past_index
+                    )
+            return new_past
+
+        past_key_values = self._extract_past_from_model_output(model_outputs)
+        if past_key_values is None:
+            raise ValueError(
+                "No known `past_key_values variable` found in model outputs (model outputs keys:"
+                f" {list(model_outputs.keys())})"
+            )
+        is_past_initialized = model_kwargs.pop("past_key_values", None) is not None
+
+        if not is_past_initialized:
+            # The padded version of `past_key_values` has a length of `max_length - 1`, as `past_key_values` holds information relative to
+            # previous autoregressive generation steps (step 0 has no past_key_values, step 1 has 1 past_key_values value, ..., the last step
+            # has `max_length - 1` past_key_values values).
+            num_padding_values = max_length - cur_len - 1
+            mask = _initialize_attention(model_kwargs, num_padding_values, is_encoder_decoder)
+            new_past = _initialize_past(past_key_values, num_padding_values, batch_axis)
+        else:
+            # The new index of past_key_values to be filled corresponds to the current length of the sequence, with two
+            # subtractions: -1 because past_key_values holds information regarding previous generation steps (read comment above)
+            # and -1 again because in an array the index is the length of the array minus 1.
+            new_past_index = cur_len - 2
+            mask = _update_attention(model_kwargs, new_past_index, is_encoder_decoder)
+            new_past = _update_past(past_key_values, new_past_index, batch_axis)
+
+        # sets the updated variables (mask and past_key_values)
+        model_kwargs.update(mask)
+        model_kwargs["past_key_values"] = tuple(new_past)
+
+        return model_kwargs
+
+    def _get_logits_warper(
+        self,
+        generation_config: GenerationConfig,
+    ) -> TFLogitsProcessorList:
+        """
+        This class returns a [`TFLogitsProcessorList`] list object that contains all relevant [`TFLogitsWarper`]
+        instances used for multinomial sampling.
+        """
+
+        # instantiate warpers list
+        warpers = TFLogitsProcessorList()
+
+        # In beam methods, we need to keep at least one non-eos token to explore continuations that might have a
+        # better score (i.e. keep len(generation_config.eos_token_id) + 1)
+        if generation_config.num_beams > 1:
+            if isinstance(generation_config.eos_token_id, list):
+                min_tokens_to_keep = len(generation_config.eos_token_id) + 1
+            else:
+                min_tokens_to_keep = 2
+        else:
+            min_tokens_to_keep = 1
+
+        if generation_config.temperature is not None and generation_config.temperature != 1.0:
+            warpers.append(TFTemperatureLogitsWarper(generation_config.temperature))
+        if generation_config.top_k is not None and generation_config.top_k != 0:
+            warpers.append(TFTopKLogitsWarper(top_k=generation_config.top_k, min_tokens_to_keep=min_tokens_to_keep))
+        if generation_config.top_p is not None and generation_config.top_p < 1.0:
+            warpers.append(TFTopPLogitsWarper(top_p=generation_config.top_p, min_tokens_to_keep=min_tokens_to_keep))
+        return warpers
+
+    def _get_logits_processor(
+        self,
+        generation_config: GenerationConfig,
+        input_ids_seq_length: int,
+        logits_processor: Optional[TFLogitsProcessorList],
+    ) -> TFLogitsProcessorList:
+        """
+        This class returns a [`TFLogitsProcessorList`] list object that contains all relevant [`TFLogitsProcessor`]
+        instances used to modify the scores of the language model head.
+        """
+        processors = TFLogitsProcessorList()
+
+        # instantiate processors list
+        if generation_config.repetition_penalty is not None and generation_config.repetition_penalty != 1.0:
+            processors.append(TFRepetitionPenaltyLogitsProcessor(penalty=generation_config.repetition_penalty))
+        if generation_config.no_repeat_ngram_size is not None and generation_config.no_repeat_ngram_size > 0:
+            processors.append(TFNoRepeatNGramLogitsProcessor(generation_config.no_repeat_ngram_size))
+        if generation_config.bad_words_ids is not None:
+            processors.append(
+                TFNoBadWordsLogitsProcessor(generation_config.bad_words_ids, generation_config.eos_token_id)
+            )
+        if (
+            generation_config.min_length is not None
+            and generation_config.eos_token_id is not None
+            and generation_config.min_length > 0
+        ):
+            processors.append(TFMinLengthLogitsProcessor(generation_config.min_length, generation_config.eos_token_id))
+        if generation_config.forced_bos_token_id is not None:
+            processors.append(TFForcedBOSTokenLogitsProcessor(generation_config.forced_bos_token_id))
+        if generation_config.forced_eos_token_id is not None:
+            processors.append(
+                TFForcedEOSTokenLogitsProcessor(generation_config.max_length, generation_config.forced_eos_token_id)
+            )
+        if generation_config.suppress_tokens is not None:
+            processors.append(TFSuppressTokensLogitsProcessor(generation_config.suppress_tokens))
+        if generation_config.begin_suppress_tokens is not None:
+            begin_index = input_ids_seq_length
+            begin_index = (
+                begin_index
+                if (input_ids_seq_length > 1 or generation_config.forced_bos_token_id is None)
+                else begin_index + 1
+            )
+            if generation_config.forced_decoder_ids is not None:
+                begin_index += generation_config.forced_decoder_ids[-1][
+                    0
+                ]  # generation starts after the last token that is forced
+            processors.append(
+                TFSuppressTokensAtBeginLogitsProcessor(generation_config.begin_suppress_tokens, begin_index)
+            )
+        if generation_config.forced_decoder_ids is not None:
+            processors.append(TFForceTokensLogitsProcessor(generation_config.forced_decoder_ids))
+
+        processors = self._merge_criteria_processor_list(processors, logits_processor)
+        return processors
+
+    def _merge_criteria_processor_list(
+        self,
+        default_list: TFLogitsProcessorList,
+        custom_list: TFLogitsProcessorList,
+    ) -> TFLogitsProcessorList:
+        if len(custom_list) == 0:
+            return default_list
+        for default in default_list:
+            for custom in custom_list:
+                if type(custom) is type(default):
+                    object_type = "logits processor"
+                    raise ValueError(
+                        f"A custom {object_type} of type {type(custom)} with values {custom} has been passed to"
+                        f" `generate`, but it has already been created with the values {default}. {default} has been"
+                        " created by passing the corresponding arguments to generate or by the model's config default"
+                        f" values. If you just want to change the default values of {object_type} consider passing"
+                        f" them as arguments to `generate` instead of using a custom {object_type}."
+                    )
+        default_list.extend(custom_list)
+        return default_list
+
+    def greedy_search(
+        self,
+        input_ids: tf.Tensor,
+        max_length: Optional[int] = None,
+        pad_token_id: Optional[int] = None,
+        eos_token_id: Optional[int] = None,
+        logits_processor: Optional[TFLogitsProcessorList] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        output_scores: Optional[bool] = None,
+        return_dict_in_generate: Optional[bool] = None,
+        **model_kwargs,
+    ) -> Union[TFGreedySearchOutput, tf.Tensor]:
+        r"""
+        Generates sequences for models with a language modeling head using greedy decoding.
+
+        Parameters:
+            input_ids (`tf.Tensor` of shape `(batch_size, sequence_length)`):
+                The sequence used as a prompt for the generation.
+            logits_processor (`TFLogitsProcessorList`, *optional*):
+                An instance of [`TFLogitsProcessorList`]. List of instances of class derived from [`TFLogitsProcessor`]
+                used to modify the prediction scores of the language modeling head applied at each generation step.
+            max_length (`int`, *optional*, defaults to 20):
+                The maximum length of the sequence to be generated.
+            pad_token_id (`int`, *optional*):
+                The id of the *padding* token.
+            eos_token_id (`Union[int, List[int]]`, *optional*):
+                The id of the *end-of-sequence* token. Optionally, use a list to set multiple *end-of-sequence* tokens.
+            output_attentions (`bool`, *optional*, defaults to `False`):
+                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
+                returned tensors for more details.
+            output_hidden_states (`bool`, *optional*, defaults to `False`):
+                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
+                for more details.
+            output_scores (`bool`, *optional*, defaults to `False`):
+                Whether or not to return the prediction scores. See `scores` under returned tensors for more details.
+            return_dict_in_generate (`bool`, *optional*, defaults to `False`):
+                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+            model_kwargs:
+                Additional model specific keyword arguments will be forwarded to the `call` function of the model. If
+                model is an encoder-decoder model the kwargs should include `encoder_outputs`.
+
+        Return:
+            [`~generation.TFGreedySearchDecoderOnlyOutput`], [`~generation.TFGreedySearchEncoderDecoderOutput`] or
+            `tf.Tensor`: A `tf.Tensor` containing the generated tokens (default behaviour) or a
+            [`~generation.TFGreedySearchDecoderOnlyOutput`] if `model.config.is_encoder_decoder=False` and
+            `return_dict_in_generate=True` or a [`~generation.TFGreedySearchEncoderDecoderOutput`] if
+            `model.config.is_encoder_decoder=True`.
+
+        Examples:
+
+        ```python
+        >>> from transformers import (
+        ...     AutoTokenizer,
+        ...     TFAutoModelForCausalLM,
+        ...     TFLogitsProcessorList,
+        ...     TFMinLengthLogitsProcessor,
+        ... )
+
+        >>> tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2")
+        >>> model = TFAutoModelForCausalLM.from_pretrained("openai-community/gpt2")
+
+        >>> # set pad_token_id to eos_token_id because GPT2 does not have a PAD token
+        >>> model.generation_config.pad_token_id = model.generation_config.eos_token_id
+
+        >>> input_prompt = "Today is a beautiful day, and"
+        >>> input_ids = tokenizer(input_prompt, return_tensors="tf").input_ids
+
+        >>> # instantiate logits processors
+        >>> logits_processor = TFLogitsProcessorList(
+        ...     [
+        ...         TFMinLengthLogitsProcessor(15, eos_token_id=model.generation_config.eos_token_id),
+        ...     ]
+        ... )
+
+        >>> outputs = model.greedy_search(input_ids, logits_processor=logits_processor)
+        >>> tokenizer.batch_decode(outputs, skip_special_tokens=True)
+        ["Today is a beautiful day, and I'm so happy to be here. I'm so happy to"]
+        ```"""
+
+        # 1. init greedy_search values
+        logits_processor = logits_processor if logits_processor is not None else TFLogitsProcessorList()
+
+        max_length = max_length if max_length is not None else self.generation_config.max_length
+        pad_token_id = pad_token_id if pad_token_id is not None else self.generation_config.pad_token_id
+        eos_token_id = eos_token_id if eos_token_id is not None else self.generation_config.eos_token_id
+        if isinstance(eos_token_id, int):
+            eos_token_id = [eos_token_id]
+        output_scores = output_scores if output_scores is not None else self.generation_config.output_scores
+        output_attentions = (
+            output_attentions if output_attentions is not None else self.generation_config.output_attentions
+        )
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.generation_config.output_hidden_states
+        )
+        return_dict_in_generate = (
+            return_dict_in_generate
+            if return_dict_in_generate is not None
+            else self.generation_config.return_dict_in_generate
+        )
+        use_cache = model_kwargs.pop("use_cache", self.generation_config.use_cache)
+        use_xla = not tf.executing_eagerly()
+        # TODO (Joao): fix cache format or find programatic way to detect cache index
+        # GPT2 and other models has a slightly different cache structure, with a different batch axis
+        model_name = str(self.decoder) if "EncoderDecoder" in str(self) else str(self)
+        cache_batch_axis = 1 if any(model_prefix in model_name for model_prefix in ("TFGPT2", "TFCTRL")) else 0
+        # some models, like XLNet, need more than the last token in the presence of past_key_values
+        needs_full_input = "use_mems" in set(inspect.signature(self.prepare_inputs_for_generation).parameters.keys())
+
+        # 2. init `attentions`, `hidden_states`, and `scores` tuples
+        scores = [] if (return_dict_in_generate and output_scores) else None
+        decoder_attentions = [] if (return_dict_in_generate and output_attentions) else None
+        cross_attentions = [] if (return_dict_in_generate and output_attentions) else None
+        decoder_hidden_states = [] if (return_dict_in_generate and output_hidden_states) else None
+
+        # 3. init tensors to use for "xla-compileable" generate function
+        batch_size, cur_len = shape_list(input_ids)
+
+        # initialize `generated` (`input_ids` padded with `pad_token_id`), `finished_sequences`
+        input_ids_padding = tf.ones((batch_size, max_length - cur_len), dtype=tf.int32) * (pad_token_id or 0)
+        generated = tf.concat([input_ids, input_ids_padding], axis=-1)
+        finished_sequences = tf.zeros((batch_size,), dtype=tf.bool)
+
+        # 4. define "xla-compile-able" stop-condition and auto-regressive function
+        # define condition fn
+        def greedy_search_cond_fn(generated, finished_sequences, cur_len, model_kwargs):
+            """state termination condition fn."""
+            return ~tf.reduce_all(finished_sequences)
+
+        # define condition fn
+        def greedy_search_body_fn(generated, finished_sequences, cur_len, model_kwargs):
+            """state update fn."""
+            if model_kwargs.get("past_key_values") is None or needs_full_input:
+                input_ids = generated[:, :cur_len]
+            else:
+                input_ids = tf.expand_dims(generated[:, cur_len - 1], -1)
+            model_inputs = self.prepare_inputs_for_generation(input_ids, use_cache=use_cache, **model_kwargs)
+            # forward pass to get next token logits
+            model_outputs = self(
+                **model_inputs,
+                return_dict=True,
+                output_attentions=output_attentions,
+                output_hidden_states=output_hidden_states,
+            )
+            next_token_logits = model_outputs.logits[:, -1]
+
+            # pre-process distribution
+            next_tokens_scores = logits_processor(generated, next_token_logits, cur_len)
+
+            # Store scores, attentions and hidden_states when required
+            if not use_xla and return_dict_in_generate:
+                if output_scores:
+                    scores.append(next_tokens_scores)
+                if output_attentions and self.config.is_encoder_decoder:
+                    decoder_attentions.append(model_outputs.decoder_attentions)
+                elif output_attentions and not self.config.is_encoder_decoder:
+                    decoder_attentions.append(model_outputs.attentions)
+                    if self.config.is_encoder_decoder:
+                        cross_attentions.append(model_outputs.cross_attentions)
+
+                if output_hidden_states and self.config.is_encoder_decoder:
+                    decoder_hidden_states.append(model_outputs.decoder_hidden_states)
+                elif output_hidden_states and self.config.is_encoder_decoder:
+                    decoder_hidden_states.append(model_outputs.hidden_states)
+
+            # argmax
+            next_tokens = tf.argmax(next_tokens_scores, axis=-1, output_type=tf.int32)
+
+            if eos_token_id is not None:
+                if pad_token_id is None:
+                    raise ValueError("If `eos_token_id` is defined, make sure that `pad_token_id` is defined.")
+                unfinished_seq = 1 - tf.cast(finished_sequences, tf.int32)
+                next_tokens = next_tokens * unfinished_seq + pad_token_id * (1 - unfinished_seq)
+                next_token_is_eos = tf.math.reduce_any(
+                    tf.equal(
+                        tf.broadcast_to(next_tokens, (len(eos_token_id), batch_size)), tf.expand_dims(eos_token_id, -1)
+                    ),
+                    axis=0,
+                )
+                finished_sequences = finished_sequences | next_token_is_eos
+
+            # update `generated` and `cur_len`
+            update_indices = tf.stack([tf.range(batch_size), tf.broadcast_to(cur_len, [batch_size])], axis=-1)
+            generated = tf.tensor_scatter_nd_update(tensor=generated, indices=update_indices, updates=next_tokens)
+            cur_len += 1
+
+            # update model_kwargs
+            if use_xla:
+                model_kwargs = self._update_model_kwargs_for_xla_generation(
+                    model_outputs=model_outputs,
+                    model_kwargs=model_kwargs,
+                    cur_len=cur_len,
+                    max_length=max_length,
+                    batch_size=batch_size,
+                    is_encoder_decoder=self.config.is_encoder_decoder,
+                    batch_axis=cache_batch_axis,
+                )
+            else:
+                model_kwargs = self._update_model_kwargs_for_generation(
+                    model_outputs, model_kwargs, is_encoder_decoder=self.config.is_encoder_decoder
+                )
+                # if we don't cache past_key_values key values we need the whole input
+                if model_kwargs.get("past_key_values", None) is None:
+                    # let's throw out `past_key_values` since we don't want `None` tensors
+                    model_kwargs.pop("past_key_values", None)
+
+            return generated, finished_sequences, cur_len, model_kwargs
+
+        # 5. run generation
+        # 1st generation step has to be run before to initialize `past_key_values`
+        generated, finished_sequences, cur_len, model_kwargs = greedy_search_body_fn(
+            generated, finished_sequences, cur_len, model_kwargs
+        )
+
+        # 2-to-n generation steps can then be run in autoregressive fashion
+        # only in case 1st generation step does NOT yield EOS token though
+        maximum_iterations = max_length - cur_len
+        generated, _, cur_len, _ = tf.while_loop(
+            greedy_search_cond_fn,
+            greedy_search_body_fn,
+            (generated, finished_sequences, cur_len, model_kwargs),
+            maximum_iterations=maximum_iterations,
+        )
+
+        # 6. prepare outputs
+        if not use_xla:
+            # cut for backward compatibility
+            generated = generated[:, :cur_len]
+
+        if return_dict_in_generate:
+            if self.config.is_encoder_decoder:
+                # if model is an encoder-decoder, retrieve encoder attention weights
+                # and hidden states
+                encoder_attentions = model_kwargs["encoder_outputs"].get("attentions") if output_attentions else None
+                encoder_hidden_states = (
+                    model_kwargs["encoder_outputs"].get("hidden_states") if output_hidden_states else None
+                )
+
+                scores = tuple(scores) if scores is not None else None
+                decoder_attentions = tuple(decoder_attentions) if decoder_attentions is not None else None
+                cross_attentions = tuple(cross_attentions) if cross_attentions is not None else None
+                decoder_hidden_states = tuple(decoder_hidden_states) if decoder_hidden_states is not None else None
+
+                return TFGreedySearchEncoderDecoderOutput(
+                    sequences=generated,
+                    scores=scores,
+                    encoder_attentions=encoder_attentions,
+                    encoder_hidden_states=encoder_hidden_states,
+                    decoder_attentions=decoder_attentions,
+                    cross_attentions=cross_attentions,
+                    decoder_hidden_states=decoder_hidden_states,
+                )
+            else:
+                return TFGreedySearchDecoderOnlyOutput(
+                    sequences=generated,
+                    scores=scores,
+                    attentions=decoder_attentions,
+                    hidden_states=decoder_hidden_states,
+                )
+        else:
+            return generated
+
+    def sample(
+        self,
+        input_ids: tf.Tensor,
+        logits_processor: Optional[TFLogitsProcessorList] = None,
+        logits_warper: Optional[TFLogitsProcessorList] = None,
+        max_length: Optional[int] = None,
+        pad_token_id: Optional[int] = None,
+        eos_token_id: Optional[int] = None,
+        seed: Optional[Tuple[int, int]] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        output_scores: Optional[bool] = None,
+        return_dict_in_generate: Optional[bool] = None,
+        **model_kwargs,
+    ) -> Union[TFSampleOutput, tf.Tensor]:
+        r"""
+        Generates sequences for models with a language modeling head using multinomial sampling.
+
+        Parameters:
+            input_ids (`tf.Tensor` of shape `(batch_size, sequence_length)`):
+                The sequence used as a prompt for the generation.
+            logits_processor (`TFLogitsProcessorList`, *optional*):
+                An instance of [`TFLogitsProcessorList`]. List of instances of class derived from [`TFLogitsProcessor`]
+                used to modify the prediction scores of the language modeling head applied at each generation step.
+            logits_warper (`TFLogitsProcessorList`, *optional*):
+                An instance of [`TFLogitsProcessorList`]. List of instances of class derived from [`TFLogitsWarper`]
+                used to warp the prediction score distribution of the language modeling head applied before multinomial
+                sampling at each generation step.
+            max_length (`int`, *optional*, defaults to 20):
+                The maximum length of the sequence to be generated.
+            pad_token_id (`int`, *optional*):
+                The id of the *padding* token.
+            eos_token_id (`Union[int, List[int]]`, *optional*):
+                The id of the *end-of-sequence* token. Optionally, use a list to set multiple *end-of-sequence* tokens.
+            seed (`List[int]`, *optional*):
+                Random seed to control sampling, containing two integers, used when `do_sample` is `True`. See the
+                `seed` argument from stateless functions in `tf.random`.
+            output_attentions (`bool`, *optional*, defaults to `False`):
+                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
+                returned tensors for more details.
+            output_hidden_states (`bool`, *optional*, defaults to `False`):
+                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
+                for more details.
+            output_scores (`bool`, *optional*, defaults to `False`):
+                Whether or not to return the prediction scores. See `scores` under returned tensors for more details.
+            return_dict_in_generate (`bool`, *optional*, defaults to `False`):
+                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+            model_kwargs:
+                Additional model specific kwargs will be forwarded to the `call` function of the model. If model is an
+                encoder-decoder model the kwargs should include `encoder_outputs`.
+
+        Return:
+            [`~generation.TFSampleDecoderOnlyOutput`], [`~generation.TFSampleEncoderDecoderOutput`] or `tf.Tensor`: A
+            `tf.Tensor` containing the generated tokens (default behaviour) or a
+            [`~generation.TFSampleDecoderOnlyOutput`] if `model.config.is_encoder_decoder=False` and
+            `return_dict_in_generate=True` or a [`~generation.TFSampleEncoderDecoderOutput`] if
+            `model.config.is_encoder_decoder=True`.
+
+        Examples:
+
+        ```python
+        >>> import tensorflow as tf
+        >>> from transformers import (
+        ...     AutoTokenizer,
+        ...     TFAutoModelForCausalLM,
+        ...     TFLogitsProcessorList,
+        ...     TFMinLengthLogitsProcessor,
+        ...     TFTopKLogitsWarper,
+        ...     TFTemperatureLogitsWarper,
+        ... )
+
+        >>> tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2")
+        >>> model = TFAutoModelForCausalLM.from_pretrained("openai-community/gpt2")
+
+        >>> # set pad_token_id to eos_token_id because GPT2 does not have a EOS token
+        >>> model.generation_config.pad_token_id = model.generation_config.eos_token_id
+
+        >>> input_prompt = "Today is a beautiful day, and"
+        >>> input_ids = tokenizer(input_prompt, return_tensors="tf").input_ids
+
+        >>> # instantiate logits processors
+        >>> logits_processor = TFLogitsProcessorList(
+        ...     [
+        ...         TFMinLengthLogitsProcessor(15, eos_token_id=model.generation_config.eos_token_id),
+        ...     ]
+        ... )
+        >>> # instantiate logits processors
+        >>> logits_warper = TFLogitsProcessorList(
+        ...     [
+        ...         TFTopKLogitsWarper(50),
+        ...         TFTemperatureLogitsWarper(0.7),
+        ...     ]
+        ... )
+
+        >>> tf.random.set_seed(0)
+        >>> outputs = model.sample(input_ids, logits_processor=logits_processor, logits_warper=logits_warper)
+
+        >>> tokenizer.batch_decode(outputs, skip_special_tokens=True)
+        ['Today is a beautiful day, and I love my country. But when I look at Donald Trump,']
+        ```"""
+
+        # 1. init greedy_search values
+        logits_processor = logits_processor if logits_processor is not None else TFLogitsProcessorList()
+        logits_warper = logits_warper if logits_warper is not None else TFLogitsProcessorList()
+
+        max_length = max_length if max_length is not None else self.generation_config.max_length
+        pad_token_id = pad_token_id if pad_token_id is not None else self.generation_config.pad_token_id
+        eos_token_id = eos_token_id if eos_token_id is not None else self.generation_config.eos_token_id
+        if isinstance(eos_token_id, int):
+            eos_token_id = [eos_token_id]
+        output_scores = output_scores if output_scores is not None else self.generation_config.output_scores
+        output_attentions = (
+            output_attentions if output_attentions is not None else self.generation_config.output_attentions
+        )
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.generation_config.output_hidden_states
+        )
+        return_dict_in_generate = (
+            return_dict_in_generate
+            if return_dict_in_generate is not None
+            else self.generation_config.return_dict_in_generate
+        )
+        use_cache = model_kwargs.pop("use_cache", self.generation_config.use_cache)
+        use_xla = not tf.executing_eagerly()
+        # TODO (Joao): fix cache format or find programatic way to detect cache index
+        # GPT2 and other models has a slightly different cache structure, with a different batch axis
+        model_name = str(self.decoder) if "EncoderDecoder" in str(self) else str(self)
+        cache_batch_axis = 1 if any(model_prefix in model_name for model_prefix in ("TFGPT2", "TFCTRL")) else 0
+        # some models, like XLNet, need more than the last token in the presence of past_key_values
+        needs_full_input = "use_mems" in set(inspect.signature(self.prepare_inputs_for_generation).parameters.keys())
+
+        # 2. init `attentions`, `hidden_states`, and `scores` tuples
+        scores = [] if (return_dict_in_generate and output_scores) else None
+        decoder_attentions = [] if (return_dict_in_generate and output_attentions) else None
+        cross_attentions = [] if (return_dict_in_generate and output_attentions) else None
+        decoder_hidden_states = [] if (return_dict_in_generate and output_hidden_states) else None
+
+        # 3. init tensors to use for "xla-compileable" generate function
+        batch_size, cur_len = shape_list(input_ids)
+
+        # initialize `generated` (pre-populated with `pad_token_id`), `finished_sequences`
+        input_ids_padding = tf.ones((batch_size, max_length - cur_len), dtype=tf.int32) * (pad_token_id or 0)
+        generated = tf.concat([input_ids, input_ids_padding], axis=-1)
+        finished_sequences = tf.zeros((batch_size,), dtype=tf.bool)
+
+        # 4. define "xla-compile-able" stop-condition and auto-regressive function
+        def sample_cond_fn(generated, finished_sequences, cur_len, model_kwargs):
+            return ~tf.reduce_all(finished_sequences)
+
+        def sample_body_fn(generated, finished_sequences, cur_len, model_kwargs):
+            if model_kwargs.get("past_key_values") is None or needs_full_input:
+                input_ids = generated[:, :cur_len]
+            else:
+                input_ids = tf.expand_dims(generated[:, cur_len - 1], -1)
+            model_inputs = self.prepare_inputs_for_generation(input_ids, use_cache=use_cache, **model_kwargs)
+            # forward pass to get next token logits
+            model_outputs = self(
+                **model_inputs,
+                return_dict=True,
+                output_attentions=output_attentions,
+                output_hidden_states=output_hidden_states,
+            )
+            next_token_logits = model_outputs.logits[:, -1]
+
+            # pre-process distribution
+            next_tokens_scores = logits_processor(generated, next_token_logits, cur_len)
+            next_tokens_scores = logits_warper(generated, next_tokens_scores, cur_len)
+
+            # Store scores, attentions and hidden_states when required
+            if not use_xla and return_dict_in_generate:
+                if output_scores:
+                    scores.append(next_tokens_scores)
+                if output_attentions and self.config.is_encoder_decoder:
+                    decoder_attentions.append(model_outputs.decoder_attentions)
+                elif output_attentions and not self.config.is_encoder_decoder:
+                    decoder_attentions.append(model_outputs.attentions)
+                    if self.config.is_encoder_decoder:
+                        cross_attentions.append(model_outputs.cross_attentions)
+
+                if output_hidden_states and self.config.is_encoder_decoder:
+                    decoder_hidden_states.append(model_outputs.decoder_hidden_states)
+                elif output_hidden_states and self.config.is_encoder_decoder:
+                    decoder_hidden_states.append(model_outputs.hidden_states)
+
+            # sample
+            if seed is not None:
+                sample_seed = seed
+            else:
+                sample_seed = tf.experimental.numpy.random.randint(tf.int32.min, tf.int32.max, (2,), dtype=tf.int32)
+            next_tokens = tf.squeeze(
+                tf.random.stateless_categorical(
+                    logits=next_tokens_scores, num_samples=1, seed=sample_seed, dtype=tf.int32
+                ),
+                axis=1,
+            )
+
+            if eos_token_id is not None:
+                if pad_token_id is None:
+                    raise ValueError("If `eos_token_id` is defined, make sure that `pad_token_id` is defined.")
+                unfinished_seq = 1 - tf.cast(finished_sequences, tf.int32)
+                next_tokens = next_tokens * unfinished_seq + pad_token_id * (1 - unfinished_seq)
+                next_token_is_eos = tf.math.reduce_any(
+                    tf.equal(
+                        tf.broadcast_to(next_tokens, (len(eos_token_id), batch_size)), tf.expand_dims(eos_token_id, -1)
+                    ),
+                    axis=0,
+                )
+                finished_sequences = finished_sequences | next_token_is_eos
+
+            # update `generated` and `cur_len`
+            update_indices = tf.stack([tf.range(batch_size), tf.broadcast_to(cur_len, [batch_size])], axis=-1)
+            generated = tf.tensor_scatter_nd_update(tensor=generated, indices=update_indices, updates=next_tokens)
+            cur_len += 1
+
+            # update model_kwargs
+            if use_xla:
+                model_kwargs = self._update_model_kwargs_for_xla_generation(
+                    model_outputs=model_outputs,
+                    model_kwargs=model_kwargs,
+                    cur_len=cur_len,
+                    max_length=max_length,
+                    batch_size=batch_size,
+                    is_encoder_decoder=self.config.is_encoder_decoder,
+                    batch_axis=cache_batch_axis,
+                )
+            else:
+                model_kwargs = self._update_model_kwargs_for_generation(
+                    model_outputs, model_kwargs, is_encoder_decoder=self.config.is_encoder_decoder
+                )
+                # if we don't cache past_key_values key values we need the whole input
+                if model_kwargs.get("past_key_values", None) is None:
+                    # let's throw out `past_key_values` since we don't want `None` tensors
+                    model_kwargs.pop("past_key_values", None)
+
+            return generated, finished_sequences, cur_len, model_kwargs
+
+        # 5. run generation
+        # 1st generation step has to be run before to initialize `past_key_values`
+        generated, finished_sequences, cur_len, model_kwargs = sample_body_fn(
+            generated, finished_sequences, cur_len, model_kwargs
+        )
+
+        # 2-to-n generation steps can then be run in autoregressive fashion
+        # only in case 1st generation step does NOT yield EOS token though
+        maximum_iterations = max_length - cur_len
+        generated, _, cur_len, _ = tf.while_loop(
+            sample_cond_fn,
+            sample_body_fn,
+            (generated, finished_sequences, cur_len, model_kwargs),
+            maximum_iterations=maximum_iterations,
+        )
+
+        # 6. prepare outputs
+        if not use_xla:
+            # cut for backward compatibility
+            generated = generated[:, :cur_len]
+
+        if return_dict_in_generate:
+            if self.config.is_encoder_decoder:
+                # if model is an encoder-decoder, retrieve encoder attention weights
+                # and hidden states
+                encoder_attentions = model_kwargs["encoder_outputs"].get("attentions") if output_attentions else None
+                encoder_hidden_states = (
+                    model_kwargs["encoder_outputs"].get("hidden_states") if output_hidden_states else None
+                )
+
+                scores = tuple(scores) if scores is not None else None
+                decoder_attentions = tuple(decoder_attentions) if decoder_attentions is not None else None
+                cross_attentions = tuple(cross_attentions) if cross_attentions is not None else None
+                decoder_hidden_states = tuple(decoder_hidden_states) if decoder_hidden_states is not None else None
+
+                return TFSampleEncoderDecoderOutput(
+                    sequences=generated,
+                    scores=scores,
+                    encoder_attentions=encoder_attentions,
+                    encoder_hidden_states=encoder_hidden_states,
+                    decoder_attentions=decoder_attentions,
+                    cross_attentions=cross_attentions,
+                    decoder_hidden_states=decoder_hidden_states,
+                )
+            else:
+                return TFSampleDecoderOnlyOutput(
+                    sequences=generated,
+                    scores=scores,
+                    attentions=decoder_attentions,
+                    hidden_states=decoder_hidden_states,
+                )
+        else:
+            return generated
+
+    @staticmethod
+    def _gather_beams(nested, beam_indices, batch_axis=0):
+        """Gathers the beam slices indexed by beam_indices into new beam array."""
+
+        def gather_fn(tensor):
+            if batch_axis > 0:
+                # pushes all dimentions before the batch to the end, so we get (batch, beam_id, ...)
+                perm = tf.concat((tf.range(tf.rank(tensor))[batch_axis:], tf.range(batch_axis)), axis=0)
+                tensor = tf.transpose(tensor, perm=perm)
+
+            gathered_tensor = tf.gather(params=tensor, indices=beam_indices, axis=1, batch_dims=1)
+            if batch_axis > 0:
+                # transposes back to the original dimensions
+                perm = tf.concat((tf.range(tf.rank(tensor))[batch_axis:], tf.range(batch_axis)), axis=0)
+                perm = tf.math.invert_permutation(perm)
+                gathered_tensor = tf.transpose(gathered_tensor, perm=perm)
+
+            return gathered_tensor
+
+        return tf.nest.map_structure(gather_fn, nested)
+
+    def beam_search(
+        self,
+        input_ids: tf.Tensor,
+        do_sample: bool = False,
+        max_length: Optional[int] = None,
+        pad_token_id: Optional[int] = None,
+        eos_token_id: Optional[int] = None,
+        length_penalty: Optional[float] = None,
+        early_stopping: Optional[Union[bool, str]] = None,
+        logits_processor: Optional[TFLogitsProcessorList] = None,
+        logits_warper: Optional[TFLogitsProcessorList] = None,
+        num_return_sequences: Optional[int] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        output_scores: Optional[bool] = None,
+        return_dict_in_generate: Optional[bool] = None,
+        **model_kwargs,
+    ) -> Union[TFBeamSearchOutput, TFBeamSampleOutput, tf.Tensor]:
+        r"""
+        Generates sequences for models with a language modeling head using beam search. If `do_sample` is `False`, uses
+        a greedy approach, otherwise does multinomial sampling without replacement.
+
+        Parameters:
+            input_ids (`tf.Tensor` of shape `(batch_size, sequence_length)`):
+                The sequence used as a prompt for the generation.
+            do_sample (`bool`, *optional*, defaults to `False`):
+                Whether or not to use sampling ; use greedy decoding otherwise.
+            max_length (`int`, *optional*, defaults to 20):
+                The maximum length of the sequence to be generated.
+            pad_token_id (`int`, *optional*):
+                The id of the *padding* token.
+            eos_token_id (`Union[int, List[int]]`, *optional*):
+                The id of the *end-of-sequence* token. Optionally, use a list to set multiple *end-of-sequence* tokens.
+            length_penalty (`float`, *optional*, defaults to 1.0):
+                Exponential penalty to the length that is used with beam-based generation. It is applied as an exponent
+                to the sequence length, which in turn is used to divide the score of the sequence. Since the score is
+                the log likelihood of the sequence (i.e. negative), `length_penalty` > 0.0 promotes longer sequences,
+                while `length_penalty` < 0.0 encourages shorter sequences.
+            early_stopping (`bool` or `str`, *optional*, defaults to `False`):
+                Controls the stopping condition for beam-based methods, like beam-search. It accepts the following
+                values: `True`, where the generation stops as soon as there are `num_beams` complete candidates;
+                `False`, where an heuristic is applied and the generation stops when is it very unlikely to find better
+                candidates; `"never"`, where the beam search procedure only stops when there cannot be better
+                candidates (canonical beam search algorithm).
+            logits_processor (`[TFLogitsProcessorList]`, *optional*):
+                An instance of [`TFLogitsProcessorList`]. List of instances of class derived from [`TFLogitsProcessor`]
+                used to modify the prediction scores of the language modeling head applied at each generation step.
+            logits_warper (`TFLogitsProcessorList`, *optional*):
+                An instance of [`TFLogitsProcessorList`]. List of instances of class derived from [`TFLogitsWarper`]
+                used to warp the prediction score distribution of the language modeling head applied before multinomial
+                sampling at each generation step.
+            num_return_sequences(`int`, *optional*, defaults to 1):
+                The number of independently computed returned sequences for each element in the batch.
+            output_attentions (`bool`, *optional*, defaults to `False`):
+                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
+                returned tensors for more details.
+            output_hidden_states (`bool`, *optional*, defaults to `False`):
+                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
+                for more details.
+            return_dict_in_generate (`bool`, *optional*, defaults to `False`):
+                Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple.
+            model_kwargs:
+                Additional model specific kwargs will be forwarded to the `call` function of the model. If model is an
+                encoder-decoder model the kwargs should include `encoder_outputs`.
+
+        Return:
+            [`~generation.TFBeamSearchDecoderOnlyOutput`], [`~generation.TFBeamSearchEncoderDecoderOutput`] or
+            `tf.Tensor`: A `tf.Tensor` containing the generated tokens (default behaviour) or a
+            [`~generation.TFBeamSearchDecoderOnlyOutput`] if `model.config.is_encoder_decoder=False` and
+            `return_dict_in_generate=True` or a [`~generation.TFBeamSearchEncoderDecoderOutput`] if
+            `model.config.is_encoder_decoder=True`.
+
+        Examples:
+
+        ```python
+        >>> from transformers import (
+        ...     AutoTokenizer,
+        ...     TFAutoModelForSeq2SeqLM,
+        ...     TFLogitsProcessorList,
+        ...     TFMinLengthLogitsProcessor,
+        ... )
+        >>> import tensorflow as tf
+
+        >>> tokenizer = AutoTokenizer.from_pretrained("google-t5/t5-base")
+        >>> model = TFAutoModelForSeq2SeqLM.from_pretrained("google-t5/t5-base")
+
+        >>> encoder_input_str = "translate English to German: How old are you?"
+        >>> encoder_input_ids = tokenizer(encoder_input_str, return_tensors="tf").input_ids
+
+        >>> # lets run beam search using 3 beams
+        >>> num_beams = 3
+        >>> # define decoder start token ids
+        >>> input_ids = tf.ones((1, num_beams, 1), dtype=tf.int32)
+        >>> input_ids = input_ids * model.generation_config.decoder_start_token_id
+
+        >>> # add encoder_outputs to model keyword arguments
+        >>> encoder_outputs = model.get_encoder()(encoder_input_ids, return_dict=True)
+        >>> encoder_outputs.last_hidden_state = tf.repeat(
+        ...     tf.expand_dims(encoder_outputs.last_hidden_state, axis=0), num_beams, axis=1
+        ... )
+        >>> model_kwargs = {"encoder_outputs": encoder_outputs}
+
+        >>> # instantiate logits processors
+        >>> logits_processor = TFLogitsProcessorList(
+        ...     [TFMinLengthLogitsProcessor(5, eos_token_id=model.generation_config.eos_token_id)]
+        ... )
+
+        >>> outputs = model.beam_search(input_ids, logits_processor=logits_processor, **model_kwargs)
+        >>> tokenizer.batch_decode(outputs, skip_special_tokens=True)
+        ['Wie alt bist du?']
+        ```"""
+
+        def flatten_beam_dim(tensor, batch_axis=0):
+            """Flattens the first two dimensions of a non-scalar array."""
+            shape = shape_list(tensor)
+            return tf.reshape(
+                tensor,
+                shape[:batch_axis] + [shape[batch_axis] * shape[batch_axis + 1]] + shape[batch_axis + 2 :],
+            )
+
+        def unflatten_beam_dim(tensor, num_beams, batch_axis=0):
+            """Unflattens the first, flat batch*beam dimension of a non-scalar array."""
+            shape = shape_list(tensor)
+            return tf.reshape(tensor, shape[:batch_axis] + [-1, num_beams] + shape[batch_axis + 1 :])
+
+        # 1. init beam_search values
+        logits_processor = logits_processor if logits_processor is not None else TFLogitsProcessorList()
+        logits_warper = logits_warper if logits_warper is not None else TFLogitsProcessorList()
+
+        max_length = max_length if max_length is not None else self.generation_config.max_length
+        pad_token_id = pad_token_id if pad_token_id is not None else self.generation_config.pad_token_id
+        eos_token_id = eos_token_id if eos_token_id is not None else self.generation_config.eos_token_id
+        if isinstance(eos_token_id, int):
+            eos_token_id = [eos_token_id]
+        num_return_sequences = (
+            num_return_sequences if num_return_sequences is not None else self.generation_config.num_return_sequences
+        )
+
+        output_attentions = (
+            output_attentions if output_attentions is not None else self.generation_config.output_attentions
+        )
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.generation_config.output_hidden_states
+        )
+        output_scores = output_scores if output_scores is not None else self.generation_config.output_scores
+        return_dict_in_generate = (
+            return_dict_in_generate
+            if return_dict_in_generate is not None
+            else self.generation_config.return_dict_in_generate
+        )
+
+        length_penalty = length_penalty if length_penalty is not None else self.generation_config.length_penalty
+        early_stopping = early_stopping if early_stopping is not None else self.generation_config.early_stopping
+
+        use_cache = model_kwargs.pop("use_cache", self.generation_config.use_cache)
+        use_xla = not tf.executing_eagerly()
+        # TODO (Joao): fix cache format or find programatic way to detect cache index
+        # GPT2 and other models has a slightly different cache structure, with a different batch axis
+        model_name = str(self.decoder) if "EncoderDecoder" in str(self) else str(self)
+        cache_batch_axis = 1 if any(model_prefix in model_name for model_prefix in ("TFGPT2", "TFCTRL")) else 0
+        # some models, like XLNet, need more than the last token in the presence of past_key_values
+        needs_full_input = "use_mems" in set(inspect.signature(self.prepare_inputs_for_generation).parameters.keys())
+
+        # 2. init `attentions`, `hidden_states`, and `scores` tuples
+        all_scores = [] if (return_dict_in_generate and output_scores) else None
+        decoder_attentions = [] if (return_dict_in_generate and output_attentions) else None
+        cross_attentions = [] if (return_dict_in_generate and output_attentions) else None
+        decoder_hidden_states = [] if (return_dict_in_generate and output_hidden_states) else None
+
+        # 3. init tensors to use for "xla-compileable" generate function
+        batch_size, num_beams, cur_len = shape_list(input_ids)
+        # store the prompt length of decoder
+        decoder_prompt_len = cur_len
+
+        # per batch, beam-item holding current token in loop, pre-populated with `pad_token_id`
+        input_ids_padding = tf.ones((batch_size, num_beams, max_length - cur_len), dtype=tf.int32) * (
+            pad_token_id or 0
+        )
+        running_sequences = tf.concat([input_ids, input_ids_padding], axis=-1)
+        sequences = tf.ones((batch_size, num_beams, max_length), dtype=tf.int32) * (pad_token_id or 0)
+
+        # per batch,beam-item state bit indicating if sentence has finished.
+        is_sent_finished = tf.zeros((batch_size, num_beams), dtype=tf.bool)
+
+        # per batch, beam-item score, logprobs
+        running_scores = tf.tile(
+            tf.expand_dims(tf.convert_to_tensor([0.0] + [-1.0e9] * (num_beams - 1)), axis=0), [batch_size, 1]
+        )
+        scores = tf.ones((batch_size, num_beams)) * -1.0e9
+
+        # per batch beam indices
+        running_beam_indices = tf.ones((batch_size, num_beams, max_length - decoder_prompt_len), dtype=tf.int32) * -1
+        beam_indices = tf.ones((batch_size, num_beams, max_length - decoder_prompt_len), dtype=tf.int32) * -1
+
+        # flatten beam dim
+        if "encoder_outputs" in model_kwargs:
+            model_kwargs["encoder_outputs"]["last_hidden_state"] = flatten_beam_dim(
+                model_kwargs["encoder_outputs"]["last_hidden_state"]
+            )
+        if "attention_mask" in model_kwargs:
+            model_kwargs["attention_mask"] = flatten_beam_dim(model_kwargs["attention_mask"])
+
+        # 4. define "xla-compile-able" stop-condition and auto-regressive function
+        # define stop-condition and auto-regressive function
+        def beam_search_cond_fn(
+            cur_len,
+            running_sequences,
+            running_scores,
+            running_beam_indices,
+            sequences,
+            scores,
+            beam_indices,
+            is_sent_finished,
+            decoder_prompt_len,
+            model_kwargs,
+        ):
+            """
+            Beam Search termination condition function -- halts the generation loop if any of these conditions becomes
+            False
+            """
+            # 1. is less than max length?
+            not_max_length_yet = cur_len < max_length
+
+            # 2. can the new beams still improve?
+            # early_stopping == False -> apply heuristic = always get the best score from `cur_len - decoder_prompt_len`. See the discussion
+            # below for more details.
+            # https://github.com/huggingface/transformers/pull/20901#issuecomment-1369845565
+            # early_stopping == "never" -> compute the best score from max_length or cur_len, depending on the sign of
+            #   length_penalty. Positive length_penalty favors longer sequences, thus we use max_length there.
+            if early_stopping == "never" and length_penalty > 0.0:
+                best_running_score = running_scores[:, :1] / ((max_length - decoder_prompt_len) ** length_penalty)
+            else:
+                best_running_score = running_scores[:, :1] / (
+                    tf.cast(cur_len - decoder_prompt_len, dtype=tf.float32) ** length_penalty
+                )
+            worst_finished_score = tf.where(
+                is_sent_finished, tf.math.reduce_min(scores, axis=1, keepdims=True), -1.0e9
+            )
+            improvement_still_possible = tf.math.reduce_any(best_running_score > worst_finished_score)
+
+            # 3. is there still a beam that has not finished?
+            still_open_beam = ~(tf.math.reduce_all(is_sent_finished) & (early_stopping is True))
+
+            return not_max_length_yet & still_open_beam & improvement_still_possible
+
+        def beam_search_body_fn(
+            cur_len,
+            running_sequences,
+            running_scores,
+            running_beam_indices,
+            sequences,
+            scores,
+            beam_indices,
+            is_sent_finished,
+            decoder_prompt_len,
+            model_kwargs,
+        ):
+            """
+            Beam Search iterative update function -- each iteration adds a new token and updates the best sequences
+            seen so far
+            """
+            # 1. Forward current tokens
+            if model_kwargs.get("past_key_values") is None or needs_full_input:
+                input_ids = running_sequences[:, :, :cur_len]
+            else:
+                input_ids = tf.expand_dims(running_sequences[:, :, cur_len - 1], -1)
+            model_inputs = self.prepare_inputs_for_generation(
+                flatten_beam_dim(input_ids), use_cache=use_cache, **model_kwargs
+            )
+            model_outputs = self(
+                **model_inputs,
+                return_dict=True,
+                output_attentions=output_attentions,
+                output_hidden_states=output_hidden_states,
+            )
+            logits = unflatten_beam_dim(model_outputs.logits[:, -1], num_beams)
+
+            # 2. Compute log probs
+            # get log probabilities from logits, process logits with processors (*e.g.* min_length, ...), and
+            # add new logprobs to existing running logprobs scores.
+            log_probs = tf.nn.log_softmax(logits)
+            log_probs = logits_processor(flatten_beam_dim(running_sequences), flatten_beam_dim(log_probs), cur_len)
+            log_probs = unflatten_beam_dim(log_probs, num_beams)
+            if do_sample:
+                log_probs = logits_warper(flatten_beam_dim(running_sequences), flatten_beam_dim(log_probs), cur_len)
+                log_probs = unflatten_beam_dim(log_probs, num_beams)
+            log_probs_processed = log_probs
+            log_probs = log_probs + tf.expand_dims(running_scores, axis=2)
+            vocab_size = log_probs.shape[2]
+            log_probs = tf.reshape(log_probs, (batch_size, num_beams * vocab_size))
+
+            # Store scores, attentions and hidden_states when required
+            if not use_xla and return_dict_in_generate:
+                if output_scores:
+                    all_scores.append(
+                        logits_warper(
+                            flatten_beam_dim(running_sequences),
+                            flatten_beam_dim(log_probs_processed),
+                            cur_len,
+                        )
+                    )
+                if output_attentions and self.config.is_encoder_decoder:
+                    decoder_attentions.append(model_outputs.decoder_attentions)
+                elif output_attentions and not self.config.is_encoder_decoder:
+                    decoder_attentions.append(model_outputs.attentions)
+                    if self.config.is_encoder_decoder:
+                        cross_attentions.append(model_outputs.cross_attentions)
+
+                if output_hidden_states and self.config.is_encoder_decoder:
+                    decoder_hidden_states.append(model_outputs.decoder_hidden_states)
+                elif output_hidden_states and self.config.is_encoder_decoder:
+                    decoder_hidden_states.append(model_outputs.hidden_states)
+
+            # 3. Retrieve top-K
+            # Each item in batch has num_beams * vocab_size candidate sequences. For each item, get the top 2*k
+            # candidates with the highest log-probabilities. We gather the top 2*K beams here so that even if the
+            # best K sequences reach EOS simultaneously, we have another K sequences remaining to continue the live
+            # beam search.
+            # Gather the top 2*K scores from _all_ beams.
+            # Gather 2*k top beams.
+            # Recover the beam index by floor division.
+            # Recover token id by modulo division and expand Id array for broadcasting.
+            # Update sequences for the 2*K top-k new sequences.
+            beams_to_keep = 2 * num_beams
+            if do_sample:
+                topk_indices = sample_without_replacement(log_probs, beams_to_keep)
+                topk_log_probs = tf.gather(log_probs, topk_indices, axis=1, batch_dims=1)
+            else:
+                topk_log_probs, topk_indices = tf.math.top_k(log_probs, k=beams_to_keep)
+            topk_current_beam_indices = topk_indices // vocab_size
+            topk_running_beam_indices = self._gather_beams(running_beam_indices, topk_current_beam_indices)
+            topk_running_sequences = self._gather_beams(running_sequences, topk_current_beam_indices)
+            topk_ids = topk_indices % vocab_size
+
+            # writes the new token
+            indices_batch = tf.repeat(tf.range(batch_size), [beams_to_keep])
+            indices_beam = tf.tile(tf.range(beams_to_keep), [batch_size])
+            update_indices = tf.stack(
+                [indices_batch, indices_beam, tf.broadcast_to(cur_len, [batch_size * beams_to_keep])], axis=-1
+            )
+            topk_sequences = tf.tensor_scatter_nd_update(
+                tensor=topk_running_sequences,
+                indices=update_indices,
+                updates=tf.reshape(topk_ids, [batch_size * beams_to_keep]),
+            )
+
+            # we want to store the beam indices with batch information -> real beam index = beam index % num beams
+            batch_modified_indices = topk_current_beam_indices + tf.broadcast_to(
+                tf.expand_dims(tf.range(batch_size) * num_beams, axis=1), topk_current_beam_indices.shape
+            )
+            update_indices = tf.stack(
+                [
+                    indices_batch,
+                    indices_beam,
+                    tf.broadcast_to(cur_len - decoder_prompt_len, [batch_size * beams_to_keep]),
+                ],
+                axis=-1,
+            )
+            topk_beam_indices = tf.tensor_scatter_nd_update(
+                tensor=topk_running_beam_indices,
+                indices=update_indices,
+                updates=tf.reshape(batch_modified_indices, [batch_size * beams_to_keep]),
+            )
+
+            # 4. Check which sequences have ended
+            # Update current sequences: Did the top `num_beams` sequences reach an end marker?
+            # To prevent these just finished sequences from being added to the current sequences
+            # set of active beam search sequences, set their log probs to a very large negative value.
+            if eos_token_id is None:
+                eos_in_next_token = tf.zeros(topk_sequences[:, :, cur_len].shape, dtype=tf.bool)
+            else:
+                eos_in_next_token = tf.math.reduce_any(
+                    tf.equal(
+                        tf.broadcast_to(
+                            topk_sequences[:, :, cur_len],
+                            [len(eos_token_id)] + topk_sequences[:, :, cur_len].shape,
+                        ),
+                        tf.expand_dims(tf.expand_dims(eos_token_id, -1), -1),
+                    ),
+                    axis=0,
+                )
+            did_topk_just_finished = eos_in_next_token & tf.broadcast_to(
+                tf.concat((tf.ones((num_beams), dtype=tf.bool), tf.zeros((num_beams), dtype=tf.bool)), axis=0),
+                shape_list(eos_in_next_token),
+            )
+
+            # non-top `num_beams` eos tokens can't be used to finish a beam, but the others can't be used in the next
+            # running sentences either
+            running_topk_log_probs = topk_log_probs + tf.cast(eos_in_next_token, tf.float32) * -1.0e9
+
+            # 5. Get running sequences scores for next
+            # Determine the top k beam indices (from top 2*k beams) from log probs and gather top k beams
+            # (from top 2*k beams).
+            next_topk_indices = tf.math.top_k(running_topk_log_probs, k=num_beams)[1]
+            next_running_sequences, next_running_scores, next_running_beam_indices = self._gather_beams(
+                [topk_sequences, running_topk_log_probs, topk_beam_indices], next_topk_indices
+            )
+
+            # 6. Process topk logits
+            # Further process log probs:
+            # - add length penalty
+            # - make sure no scores can be added anymore if beam is full
+            # - make sure still running sequences cannot be chosen as finalized beam
+            topk_log_probs = topk_log_probs / (
+                tf.cast(cur_len + 1 - decoder_prompt_len, dtype=tf.float32) ** length_penalty
+            )
+            beams_in_batch_are_full = tf.broadcast_to(
+                tf.math.reduce_all(is_sent_finished, axis=-1, keepdims=True), shape_list(did_topk_just_finished)
+            ) & (early_stopping is True)
+            add_penalty = ~did_topk_just_finished | beams_in_batch_are_full
+            topk_log_probs += tf.cast(add_penalty, tf.float32) * -1.0e9
+
+            # 7. Get scores, sequences, is sentence finished for next.
+            # Combine sequences, scores, and flags along the beam dimension and compare new finished sequence scores
+            # to existing finished scores and select the best from the new set of beams
+            merged_sequences = tf.concat([sequences, topk_sequences], axis=1)
+            merged_scores = tf.concat([scores, topk_log_probs], axis=1)
+            merged_beams = tf.concat([beam_indices, topk_beam_indices], axis=1)
+            merged_is_sent_finished = tf.concat([is_sent_finished, did_topk_just_finished], axis=1)
+            topk_merged_indices = tf.math.top_k(merged_scores, k=num_beams)[1]
+            next_sequences, next_scores, next_beam_indices, next_is_sent_finished = self._gather_beams(
+                [merged_sequences, merged_scores, merged_beams, merged_is_sent_finished], topk_merged_indices
+            )
+
+            # 8. Prepare data for the next iteration
+            # Determine the top k beam indices from the original set of all beams. With these, gather the top k
+            # beam-associated caches.
+            cur_len = cur_len + 1
+            if "past_key_values" in model_outputs:
+                cache = tf.nest.map_structure(
+                    lambda tensor: unflatten_beam_dim(tensor, num_beams, batch_axis=cache_batch_axis),
+                    model_outputs.past_key_values,
+                )
+                next_running_indices = self._gather_beams(topk_current_beam_indices, next_topk_indices)
+                next_cache = self._gather_beams(cache, next_running_indices, batch_axis=cache_batch_axis)
+                model_outputs["past_key_values"] = tf.nest.map_structure(
+                    lambda tensor: flatten_beam_dim(tensor, batch_axis=cache_batch_axis), next_cache
+                )
+
+            if use_xla:
+                next_model_kwargs = self._update_model_kwargs_for_xla_generation(
+                    model_outputs=model_outputs,
+                    model_kwargs=model_kwargs,
+                    cur_len=cur_len,
+                    max_length=max_length,
+                    batch_size=(batch_size * num_beams),
+                    is_encoder_decoder=self.config.is_encoder_decoder,
+                    batch_axis=cache_batch_axis,
+                )
+            else:
+                next_model_kwargs = self._update_model_kwargs_for_generation(
+                    model_outputs, model_kwargs, is_encoder_decoder=self.config.is_encoder_decoder
+                )
+
+                # if we don't cache past_key_values key values we need the whole input
+                if model_kwargs.get("past_key_values", None) is None:
+                    # let's throw out `past_key_values` since we don't want `None` tensors
+                    model_kwargs.pop("past_key_values", None)
+
+            return (
+                cur_len,
+                next_running_sequences,
+                next_running_scores,
+                next_running_beam_indices,
+                next_sequences,
+                next_scores,
+                next_beam_indices,
+                next_is_sent_finished,
+                decoder_prompt_len,
+                next_model_kwargs,
+            )
+
+        # 5. run generation
+        # 1st generation step has to be run before to initialize `past_key_values` (if active)
+        (
+            cur_len,
+            running_sequences,
+            running_scores,
+            running_beam_indices,
+            sequences,
+            scores,
+            beam_indices,
+            is_sent_finished,
+            decoder_prompt_len,
+            model_kwargs,
+        ) = beam_search_body_fn(
+            cur_len,
+            running_sequences,
+            running_scores,
+            running_beam_indices,
+            sequences,
+            scores,
+            beam_indices,
+            is_sent_finished,
+            decoder_prompt_len,
+            model_kwargs,
+        )
+
+        # 2-to-n generation steps can then be run in autoregressive fashion (only in case 1st generation step does
+        # NOT yield EOS token though)
+        maximum_iterations = max_length - cur_len
+        (
+            cur_len,
+            running_sequences,
+            running_scores,
+            running_beam_indices,
+            sequences,
+            scores,
+            beam_indices,
+            is_sent_finished,
+            decoder_prompt_len,
+            _,
+        ) = tf.while_loop(
+            beam_search_cond_fn,
+            beam_search_body_fn,
+            (
+                cur_len,
+                running_sequences,
+                running_scores,
+                running_beam_indices,
+                sequences,
+                scores,
+                beam_indices,
+                is_sent_finished,
+                decoder_prompt_len,
+                model_kwargs,
+            ),
+            maximum_iterations=maximum_iterations,
+        )
+
+        # 6. prepare outputs
+        # Account for the edge-case where there are no finished sequences for a particular batch item. If so, return
+        # running sequences for that batch item.
+        none_finished = tf.math.reduce_any(is_sent_finished, axis=1)
+        sequences = tf.where(none_finished[:, None, None], sequences, running_sequences)
+        beam_indices = tf.where(none_finished[:, None, None], beam_indices, running_beam_indices)
+
+        # Apply the length penalty so that running scores match the finalized scores if they are used
+        running_scores = running_scores / (tf.cast(cur_len - decoder_prompt_len, dtype=tf.float32) ** length_penalty)
+        scores = tf.where(none_finished[:, None], scores, running_scores)
+
+        # Take best beams for each batch (the score is sorted in descending order)
+        sequences = flatten_beam_dim(sequences[:, :num_return_sequences, :])
+        scores = flatten_beam_dim(scores[:, :num_return_sequences])
+        beam_indices = flatten_beam_dim(beam_indices[:, :num_return_sequences, :])
+
+        if not use_xla:
+            # Cut for backward compatibility
+            sequences = sequences[:, :cur_len]
+            beam_indices = beam_indices[:, : cur_len - decoder_prompt_len]
+
+        if return_dict_in_generate:
+            if self.config.is_encoder_decoder:
+                # if model is an encoder-decoder, retrieve encoder attention weights and hidden states
+                encoder_attentions = model_kwargs["encoder_outputs"].get("attentions") if output_attentions else None
+                encoder_hidden_states = (
+                    model_kwargs["encoder_outputs"].get("hidden_states") if output_hidden_states else None
+                )
+
+                output_cls = TFBeamSampleEncoderDecoderOutput if do_sample else TFBeamSearchEncoderDecoderOutput
+                return output_cls(
+                    sequences=sequences,
+                    sequences_scores=scores,
+                    scores=all_scores,
+                    beam_indices=beam_indices,
+                    encoder_attentions=encoder_attentions,
+                    encoder_hidden_states=encoder_hidden_states,
+                    decoder_attentions=decoder_attentions,
+                    cross_attentions=cross_attentions,
+                    decoder_hidden_states=decoder_hidden_states,
+                )
+            else:
+                output_cls = TFBeamSampleDecoderOnlyOutput if do_sample else TFBeamSearchDecoderOnlyOutput
+                return output_cls(
+                    sequences=sequences,
+                    sequences_scores=scores,
+                    scores=all_scores,
+                    beam_indices=beam_indices,
+                    attentions=decoder_attentions,
+                    hidden_states=decoder_hidden_states,
+                )
+        else:
+            return sequences
+
+    def contrastive_search(
+        self,
+        input_ids: tf.Tensor,
+        top_k: Optional[int] = 1,
+        penalty_alpha: Optional[float] = 0,
+        logits_processor: Optional[TFLogitsProcessorList] = None,
+        logits_warper: Optional[TFLogitsProcessorList] = None,
+        max_length: Optional[int] = None,
+        pad_token_id: Optional[int] = None,
+        eos_token_id: Optional[int] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        output_scores: Optional[bool] = None,
+        return_dict_in_generate: Optional[bool] = None,
+        **model_kwargs,
+    ) -> Union[TFContrastiveSearchOutput, tf.Tensor]:
+        r"""
+        Generates sequences of token ids for models with a language modeling head using **contrastive search** and can
+        be used for text-decoder, text-to-text, speech-to-text, and vision-to-text models.
+
+        Parameters:
+            input_ids (`tf.Tensor` of shape `(batch_size, sequence_length)`):
+                The sequence used as a prompt for the generation.
+            top_k (`int`, *optional*, defaults to 1):
+                The size of the candidate set that is used to re-rank for contrastive search
+            penalty_alpha (`float`, *optional*, defaults to 0):
+                The degeneration penalty for contrastive search; activate when it is larger than 0
+            logits_processor (`TFLogitsProcessorList`, *optional*):
+                An instance of [`TFLogitsProcessorList`]. List of instances of class derived from [`TFLogitsProcessor`]
+                used to modify the prediction scores of the language modeling head applied at each generation step.
+            logits_warper (`TFLogitsProcessorList`, *optional*):
+                An instance of [`TFLogitsProcessorList`]. List of instances of class derived from [`TFLogitsWarper`]
+                used to warp the prediction score distribution of the language modeling head applied before multinomial
+                sampling at each generation step.
+            max_length (`int`, *optional*, defaults to 20):
+                The maximum length of the sequence to be generated.
+            pad_token_id (`int`, *optional*):
+                The id of the *padding* token.
+            eos_token_id (`Union[int, List[int]]`, *optional*):
+                The id of the *end-of-sequence* token. Optionally, use a list to set multiple *end-of-sequence* tokens.
+            output_attentions (`bool`, *optional*, defaults to `False`):
+                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
+                returned tensors for more details.
+            output_hidden_states (`bool`, *optional*, defaults to `False`):
+                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
+                for more details.
+            output_scores (`bool`, *optional*, defaults to `False`):
+                Whether or not to return the prediction scores. See `scores` under returned tensors for more details.
+            return_dict_in_generate (`bool`, *optional*, defaults to `False`):
+                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+            model_kwargs:
+                Additional model specific keyword arguments will be forwarded to the `call` function of the model. If
+                model is an encoder-decoder model the kwargs should include `encoder_outputs`.
+        Return:
+            [`~generation.TFContrastiveSearchDecoderOnlyOutput`],
+            [`~generation.TFContrastiveSearchEncoderDecoderOutput`] or `tf.Tensor`: A `tf.Tensor` containing the
+            generated tokens (default behaviour) or a [`~generation.TFContrastiveySearchDecoderOnlyOutput`] if
+            `model.config.is_encoder_decoder=False` and `return_dict_in_generate=True` or a
+            [`~generation.TFContrastiveSearchEncoderDecoderOutput`] if `model.config.is_encoder_decoder=True`.
+        Examples:
+        ```python
+        >>> from transformers import AutoTokenizer, TFAutoModelForCausalLM
+
+        >>> tokenizer = AutoTokenizer.from_pretrained("facebook/opt-125m")
+        >>> model = TFAutoModelForCausalLM.from_pretrained("facebook/opt-125m")
+        >>> # set pad_token_id to eos_token_id because OPT does not have a PAD token
+        >>> model.config.pad_token_id = model.config.eos_token_id
+        >>> input_prompt = "DeepMind Company is"
+        >>> input_ids = tokenizer(input_prompt, return_tensors="tf")
+        >>> outputs = model.contrastive_search(**input_ids, penalty_alpha=0.6, top_k=4, max_length=64)
+        >>> tokenizer.batch_decode(outputs, skip_special_tokens=True)
+        ['DeepMind Company is a company that focuses on the development and commercialization of artificial intelligence (AI). DeepMind’s mission is to help people understand and solve problems that are difficult to solve in the world today.\n\nIn this post, we talk about the benefits of deep learning in business and how it']
+        ```"""
+
+        def gather_best_candidate(nested, selected_idx_stacked, batch_axis=0):
+            """Gathers the slices indexed by selected_idx_stacked from a potentially nested structure of tensors."""
+
+            def gather_fn(tensor):
+                gathered_tensor = tf.gather(params=tensor, indices=selected_idx_stacked, axis=batch_axis)
+                return gathered_tensor
+
+            return tf.nest.map_structure(gather_fn, nested)
+
+        # 1. init greedy_search values
+        logits_processor = logits_processor if logits_processor is not None else TFLogitsProcessorList()
+        logits_warper = logits_warper if logits_warper is not None else TFLogitsProcessorList()
+        max_length = max_length if max_length is not None else self.generation_config.max_length
+        pad_token_id = pad_token_id if pad_token_id is not None else self.generation_config.pad_token_id
+        eos_token_id = eos_token_id if eos_token_id is not None else self.generation_config.eos_token_id
+        if isinstance(eos_token_id, int):
+            eos_token_id = [eos_token_id]
+        output_scores = output_scores if output_scores is not None else self.generation_config.output_scores
+        output_attentions = (
+            output_attentions if output_attentions is not None else self.generation_config.output_attentions
+        )
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.generation_config.output_hidden_states
+        )
+        return_dict_in_generate = (
+            return_dict_in_generate
+            if return_dict_in_generate is not None
+            else self.generation_config.return_dict_in_generate
+        )
+        use_cache = True  # In contrastive search, we always use cache
+        model_kwargs.pop("use_cache", None)
+
+        use_xla = not tf.executing_eagerly()
+        # TODO (Joao): fix cache format or find programatic way to detect cache index
+        # GPT2 and other models has a slightly different cache structure, with a different batch axis
+        model_name = str(self.decoder) if "EncoderDecoder" in str(self) else str(self)
+        cache_batch_axis = 1 if any(model_prefix in model_name for model_prefix in ("TFGPT2", "TFCTRL")) else 0
+
+        # 2. init `attentions`, `hidden_states`, and `scores` tuples
+        scores = [] if (return_dict_in_generate and output_scores) else None
+        decoder_attentions = [] if (return_dict_in_generate and output_attentions) else None
+        cross_attentions = [] if (return_dict_in_generate and output_attentions) else None
+        decoder_hidden_states = [] if (return_dict_in_generate and output_hidden_states) else None
+
+        # 3. init tensors to use for "xla-compileable" generate function
+        batch_size, cur_len = shape_list(input_ids)
+
+        # initialize `generated` (`input_ids` padded with `pad_token_id`), `finished_sequences`
+        input_ids_padding = tf.ones((batch_size, max_length - cur_len), dtype=tf.int32) * (pad_token_id or 0)
+        generated = tf.concat([input_ids, input_ids_padding], axis=-1)
+        finished_sequences = tf.zeros((batch_size,), dtype=tf.bool)
+
+        # 4. define "xla-compile-able" stop-condition and auto-regressive function
+        # define condition fn
+        def contrastive_search_cond_fn(
+            generated, finished_sequences, cur_len, model_kwargs, next_step_cached_variables
+        ):
+            """state termination condition fn."""
+            return ~tf.reduce_all(finished_sequences)
+
+        # define condition fn
+        def contrastive_search_body_fn(
+            generated, finished_sequences, cur_len, model_kwargs, next_step_cached_variables
+        ):
+            """state update fn."""
+
+            # if the first step in the loop, encode all the prefix and obtain: (1) past_key_values;
+            # (2) last_hidden_states; (3) logit_for_next_step; (4) update model kwargs for the next step
+            if model_kwargs.get("past_key_values") is None:
+                # prepare inputs
+                model_inputs = self.prepare_inputs_for_generation(
+                    generated[:, :cur_len], use_cache=use_cache, **model_kwargs
+                )
+
+                # encode the given prefix and prepare model inputs; encoder-decoder model process the prefix and save
+                # the `encoder_outputs`
+                outputs = self(
+                    **model_inputs, return_dict=True, output_hidden_states=True, output_attentions=output_attentions
+                )
+
+                # last decoder hidden states will be used to compute the degeneration penalty (cosine similarity with
+                # previous tokens)
+                if self.config.is_encoder_decoder:
+                    last_hidden_states = outputs.decoder_hidden_states[-1]
+                else:
+                    last_hidden_states = outputs.hidden_states[-1]
+
+                # XLA: last_hidden_states normally grows at each step, but in XLA it is padded so as to be used across
+                # iterations (with fixed shapes)
+                if use_xla:
+                    last_hidden_states = tf.pad(last_hidden_states, [[0, 0], [0, max_length - cur_len], [0, 0]])
+
+                # next logit for contrastive search to select top-k candidate tokens
+                logit_for_next_step = outputs.logits[:, -1, :]
+
+                if use_xla:
+                    model_kwargs = self._update_model_kwargs_for_xla_generation(
+                        model_outputs=outputs,
+                        model_kwargs=model_kwargs,
+                        cur_len=cur_len,
+                        max_length=max_length,
+                        batch_size=batch_size,
+                        is_encoder_decoder=self.config.is_encoder_decoder,
+                        batch_axis=cache_batch_axis,
+                    )
+                else:
+                    model_kwargs = self._update_model_kwargs_for_generation(
+                        outputs, model_kwargs, is_encoder_decoder=self.config.is_encoder_decoder
+                    )
+
+                # Expands model inputs top_k times, for batched forward passes (akin to beam search).
+                _, model_kwargs = self._expand_inputs_for_generation(
+                    expand_size=top_k, is_encoder_decoder=self.config.is_encoder_decoder, **model_kwargs
+                )
+
+                past_key_values = model_kwargs.get("past_key_values")
+                if past_key_values is None:
+                    raise ValueError(
+                        f"{self.__class__.__name__} does not support caching and therefore **can't** be used "
+                        "for contrastive search."
+                    )
+                elif (
+                    not isinstance(past_key_values[0], (tuple, tf.Tensor))
+                    or past_key_values[0][0].shape[0] != batch_size
+                ):
+                    raise ValueError(
+                        f"{self.__class__.__name__} does not have a standard cache format and therefore **can't** be "
+                        "used for contrastive search without further modifications."
+                    )
+            else:
+                logit_for_next_step = next_step_cached_variables["logit_for_next_step"]
+                last_hidden_states = next_step_cached_variables["last_hidden_states"]
+                outputs = next_step_cached_variables["outputs"]
+
+            # contrastive_search main logic start:
+            # contrastive search decoding consists of two steps: (1) candidate tokens recall; (2) candidate re-rank by
+            # degeneration penalty
+
+            logit_for_next_step = logits_processor(generated, logit_for_next_step, cur_len)
+            logit_for_next_step = logits_warper(generated, logit_for_next_step, cur_len)
+            next_probs = stable_softmax(logit_for_next_step, axis=-1)
+            top_k_probs, top_k_ids = tf.math.top_k(next_probs, k=top_k)
+
+            # Store scores, attentions and hidden_states when required
+            if not use_xla and return_dict_in_generate:
+                if output_scores:
+                    scores.append(logit_for_next_step)
+                if output_attentions and self.config.is_encoder_decoder:
+                    decoder_attentions.append(outputs.decoder_attentions)
+                elif output_attentions and not self.config.is_encoder_decoder:
+                    decoder_attentions.append(outputs.attentions)
+                    if self.config.is_encoder_decoder:
+                        cross_attentions.append(outputs.cross_attentions)
+
+                if output_hidden_states and self.config.is_encoder_decoder:
+                    decoder_hidden_states.append(outputs.decoder_hidden_states)
+                elif output_hidden_states and self.config.is_encoder_decoder:
+                    decoder_hidden_states.append(outputs.hidden_states)
+
+            # Replicates the new past_key_values to match the `top_k` candidates
+            model_kwargs["past_key_values"] = tf.nest.map_structure(
+                lambda tensor: tf.repeat(tensor, top_k, axis=cache_batch_axis), model_kwargs["past_key_values"]
+            )
+
+            # compute the candidate tokens by the language model and collects their hidden_states
+            next_model_inputs = self.prepare_inputs_for_generation(
+                tf.reshape(top_k_ids, [-1, 1]), use_cache=use_cache, **model_kwargs
+            )
+            outputs = self(
+                **next_model_inputs, return_dict=True, output_hidden_states=True, output_attentions=output_attentions
+            )
+            next_past_key_values = self._extract_past_from_model_output(outputs)
+
+            logits = outputs.logits[:, -1, :]
+            # name is different for encoder-decoder and decoder-only models
+            if self.config.is_encoder_decoder:
+                next_hidden = outputs.decoder_hidden_states[-1]
+                full_hidden_states = outputs.decoder_hidden_states
+            else:
+                next_hidden = outputs.hidden_states[-1]
+                full_hidden_states = outputs.hidden_states
+            context_hidden = tf.repeat(last_hidden_states[:, :cur_len, :], top_k, axis=0)
+
+            # compute the degeneration penalty and re-rank the candidates based on the degeneration penalty and the
+            # model confidence
+            selected_idx = _ranking_fast(context_hidden, next_hidden, top_k_probs, penalty_alpha, top_k)
+
+            # converts indices to a dimension of top_k to the stacked top_k * batch_size dimension, for indexing
+            # without a need to reshape on tensors that have these two dimensions stacked
+            selected_idx_stacked = selected_idx + tf.range(selected_idx.shape[0], dtype=tf.int64) * top_k
+
+            # prepare for the next step: (1) next token_id; (2) past_key_values; (3) last_hidden_states for computing
+            # the degeneration penalty; (4) logits for selecting next top-k candidates; (5) selected tokens scores
+            # (model confidence minus degeneration penalty); (6) decoder hidden_states
+            next_tokens = tf.gather(top_k_ids, selected_idx, axis=1, batch_dims=1)
+            next_hidden = gather_best_candidate(next_hidden, selected_idx_stacked)
+
+            # XLA: last_hidden_states normally grows at each step, but in XLA it is padded so as to be used across
+            # iterations (with fixed shapes)
+            if use_xla:
+                last_hidden_states = dynamic_update_slice(last_hidden_states, next_hidden, [0, cur_len, 0])
+            else:
+                last_hidden_states = tf.concat([last_hidden_states, next_hidden], axis=1)
+
+            next_decoder_hidden_states = gather_best_candidate(full_hidden_states, selected_idx_stacked)
+            next_past_key_values = gather_best_candidate(
+                next_past_key_values, selected_idx_stacked, batch_axis=cache_batch_axis
+            )
+            logit_for_next_step = gather_best_candidate(logits, selected_idx_stacked)
+
+            # Rebuilds the relevant parts of the model output for the selected token, for use in the next iteration
+            if self.config.is_encoder_decoder:
+                next_step_cross_attentions = ()
+                next_step_decoder_attentions = ()
+                if output_attentions:
+                    next_step_cross_attentions = gather_best_candidate(outputs.cross_attentions, selected_idx_stacked)
+                    next_step_decoder_attentions = gather_best_candidate(
+                        outputs.decoder_attentions, selected_idx_stacked
+                    )
+                outputs = TFSeq2SeqLMOutput(
+                    past_key_values=next_past_key_values,
+                    decoder_hidden_states=next_decoder_hidden_states,
+                    decoder_attentions=next_step_decoder_attentions or None,
+                    cross_attentions=next_step_cross_attentions or None,
+                )
+            else:
+                next_step_attentions = ()
+                if output_attentions:
+                    next_step_attentions = gather_best_candidate(outputs.attentions, selected_idx_stacked)
+                outputs = TFCausalLMOutputWithPast(
+                    past_key_values=next_past_key_values,
+                    hidden_states=next_decoder_hidden_states,
+                    attentions=next_step_attentions or None,
+                )
+            # contrastive_search main logic end
+
+            if eos_token_id is not None:
+                if pad_token_id is None:
+                    raise ValueError("If `eos_token_id` is defined, make sure that `pad_token_id` is defined.")
+                unfinished_seq = 1 - tf.cast(finished_sequences, tf.int32)
+                next_tokens = next_tokens * unfinished_seq + pad_token_id * (1 - unfinished_seq)
+                next_token_is_eos = tf.math.reduce_any(
+                    tf.equal(
+                        tf.broadcast_to(next_tokens, (len(eos_token_id), batch_size)), tf.expand_dims(eos_token_id, -1)
+                    ),
+                    axis=0,
+                )
+                finished_sequences = finished_sequences | next_token_is_eos
+
+            # update `generated` and `cur_len`
+            update_indices = tf.stack([tf.range(batch_size), tf.broadcast_to(cur_len, [batch_size])], axis=-1)
+            generated = tf.tensor_scatter_nd_update(tensor=generated, indices=update_indices, updates=next_tokens)
+            cur_len += 1
+
+            if use_xla:
+                # NOTE: 1) relative to other generation strategies, contrastive search is always running forward
+                # passes one step ahead -- hence the `cur_len=cur_len + 1`; 2) the attention mask here is expanded from
+                # [batch_size, ...] to [batch_size*top_k, ...] -- hence the `batch_size=batch_size * top_k`
+                model_kwargs = self._update_model_kwargs_for_xla_generation(
+                    model_outputs=outputs,
+                    model_kwargs=model_kwargs,
+                    cur_len=cur_len + 1,
+                    max_length=max_length,
+                    batch_size=batch_size * top_k,
+                    is_encoder_decoder=self.config.is_encoder_decoder,
+                    batch_axis=cache_batch_axis,
+                )
+            else:
+                model_kwargs = self._update_model_kwargs_for_generation(
+                    outputs, model_kwargs, is_encoder_decoder=self.config.is_encoder_decoder
+                )
+
+            next_step_cached_variables = {
+                "logit_for_next_step": logit_for_next_step,
+                "last_hidden_states": last_hidden_states,
+                "outputs": outputs,
+            }
+            return generated, finished_sequences, cur_len, model_kwargs, next_step_cached_variables
+
+        # 5. run generation
+        # 1st generation step has to be run before to initialize `past_key_values`
+        generated, finished_sequences, cur_len, model_kwargs, next_step_cached_variables = contrastive_search_body_fn(
+            generated, finished_sequences, cur_len, model_kwargs, None
+        )
+
+        # 2-to-n generation steps can then be run in autoregressive fashion
+        # only in case 1st generation step does NOT yield EOS token though
+        maximum_iterations = max_length - cur_len
+        generated, _, cur_len, _, _ = tf.while_loop(
+            contrastive_search_cond_fn,
+            contrastive_search_body_fn,
+            (generated, finished_sequences, cur_len, model_kwargs, next_step_cached_variables),
+            maximum_iterations=maximum_iterations,
+        )
+
+        # 6. prepare outputs
+        if not use_xla:
+            # cut for backward compatibility
+            generated = generated[:, :cur_len]
+
+        if return_dict_in_generate:
+            if self.config.is_encoder_decoder:
+                # if model is an encoder-decoder, retrieve encoder attention weights
+                # and hidden states
+                encoder_attentions = model_kwargs["encoder_outputs"].get("attentions") if output_attentions else None
+                encoder_hidden_states = (
+                    model_kwargs["encoder_outputs"].get("hidden_states") if output_hidden_states else None
+                )
+
+                scores = tuple(scores) if scores is not None else None
+                decoder_attentions = tuple(decoder_attentions) if decoder_attentions is not None else None
+                cross_attentions = tuple(cross_attentions) if cross_attentions is not None else None
+                decoder_hidden_states = tuple(decoder_hidden_states) if decoder_hidden_states is not None else None
+
+                return TFContrastiveSearchEncoderDecoderOutput(
+                    sequences=generated,
+                    scores=scores,
+                    encoder_attentions=encoder_attentions,
+                    encoder_hidden_states=encoder_hidden_states,
+                    decoder_attentions=decoder_attentions,
+                    cross_attentions=cross_attentions,
+                    decoder_hidden_states=decoder_hidden_states,
+                )
+            else:
+                return TFContrastiveSearchDecoderOnlyOutput(
+                    sequences=generated,
+                    scores=scores,
+                    attentions=decoder_attentions,
+                    hidden_states=decoder_hidden_states,
+                )
+        else:
+            return generated
+
+
+def scatter_values_on_batch_indices(values, batch_indices):
+    shape = shape_list(batch_indices)
+    # broadcast batch dim to shape
+    broad_casted_batch_dims = tf.reshape(tf.broadcast_to(tf.expand_dims(tf.range(shape[0]), axis=-1), shape), [1, -1])
+    # transform batch_indices to pair_indices
+    pair_indices = tf.transpose(tf.concat([broad_casted_batch_dims, tf.reshape(batch_indices, [1, -1])], 0))
+    # scatter values to pair indices
+    return tf.scatter_nd(pair_indices, tf.reshape(values, [-1]), shape)
+
+
+def sample_without_replacement(logits, num_samples):
+    """
+    categorical sampling without replacement is currently not implemented the gumbel-max trick will do for now see
+    https://github.com/tensorflow/tensorflow/issues/9260 for more info
+    """
+    z = -tf.math.log(-tf.math.log(tf.random.uniform(shape_list(logits), 0, 1)))
+    _, indices = tf.nn.top_k(logits + z, num_samples)
+    return indices
+
+
+def _ranking_fast(
+    context_hidden: tf.Tensor,
+    next_hidden: tf.Tensor,
+    next_top_k_probs: tf.Tensor,
+    alpha: float,
+    beam_width: int,
+) -> tf.Tensor:
+    """
+    Reranks the top_k candidates based on a degeneration penalty (cosine similarity with previous tokens), as described
+    in the paper "A Contrastive Framework for Neural Text Generation". Returns the index of the best candidate for each
+    row in the batch.
+    """
+    norm_context_hidden = context_hidden / tf.norm(context_hidden, axis=2, keepdims=True)
+    norm_next_hidden = next_hidden / tf.norm(next_hidden, axis=2, keepdims=True)
+    cosine_matrix = tf.squeeze(tf.linalg.matmul(norm_context_hidden, norm_next_hidden, transpose_b=True), axis=-1)
+    degeneration_penalty = tf.reduce_max(cosine_matrix, axis=-1)
+    next_top_k_probs = tf.reshape(next_top_k_probs, shape=[-1])
+    contrastive_score = (1.0 - alpha) * next_top_k_probs - alpha * degeneration_penalty
+    contrastive_score = tf.reshape(contrastive_score, shape=[-1, beam_width])
+    selected_idx = tf.argmax(contrastive_score, axis=1)
+    return selected_idx
diff --git a/.venv/lib/python3.11/site-packages/transformers/generation/utils.py b/.venv/lib/python3.11/site-packages/transformers/generation/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..18cbab6005762156bc2716cd20179fe439c73608
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/transformers/generation/utils.py
@@ -0,0 +1,4792 @@
+# coding=utf-8
+# Copyright 2020 The Google AI Language Team Authors, Facebook AI Research authors and The HuggingFace Inc. team.
+# Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import copy
+import inspect
+import os
+import warnings
+from dataclasses import dataclass
+from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple, Union
+
+import numpy as np
+import torch
+import torch.distributed as dist
+from torch import nn
+from torch.nn import functional as F
+
+from ..cache_utils import (
+    Cache,
+    DynamicCache,
+    EncoderDecoderCache,
+    OffloadedCache,
+    QuantizedCacheConfig,
+    StaticCache,
+)
+from ..configuration_utils import PretrainedConfig
+from ..integrations.deepspeed import is_deepspeed_zero3_enabled
+from ..integrations.fsdp import is_fsdp_managed_module
+from ..modeling_outputs import CausalLMOutputWithPast, Seq2SeqLMOutput
+from ..pytorch_utils import isin_mps_friendly
+from ..tokenization_utils import ExtensionsTrie
+from ..utils import (
+    ModelOutput,
+    is_accelerate_available,
+    is_hqq_available,
+    is_optimum_quanto_available,
+    is_torchdynamo_compiling,
+    logging,
+)
+from .beam_constraints import DisjunctiveConstraint, PhrasalConstraint
+from .beam_search import BeamScorer, BeamSearchScorer, ConstrainedBeamSearchScorer
+from .candidate_generator import (
+    AssistedCandidateGenerator,
+    AssistedCandidateGeneratorDifferentTokenizers,
+    CandidateGenerator,
+    EarlyExitCandidateGenerator,
+    PromptLookupCandidateGenerator,
+    _crop_past_key_values,
+    _prepare_attention_mask,
+    _prepare_token_type_ids,
+)
+from .configuration_utils import (
+    NEED_SETUP_CACHE_CLASSES_MAPPING,
+    QUANT_BACKEND_CLASSES_MAPPING,
+    GenerationConfig,
+    GenerationMode,
+)
+from .logits_process import (
+    EncoderNoRepeatNGramLogitsProcessor,
+    EncoderRepetitionPenaltyLogitsProcessor,
+    EpsilonLogitsWarper,
+    EtaLogitsWarper,
+    ExponentialDecayLengthPenalty,
+    ForcedBOSTokenLogitsProcessor,
+    ForcedEOSTokenLogitsProcessor,
+    HammingDiversityLogitsProcessor,
+    InfNanRemoveLogitsProcessor,
+    LogitNormalization,
+    LogitsProcessorList,
+    MinLengthLogitsProcessor,
+    MinNewTokensLengthLogitsProcessor,
+    MinPLogitsWarper,
+    NoBadWordsLogitsProcessor,
+    NoRepeatNGramLogitsProcessor,
+    PrefixConstrainedLogitsProcessor,
+    RepetitionPenaltyLogitsProcessor,
+    SequenceBiasLogitsProcessor,
+    SuppressTokensAtBeginLogitsProcessor,
+    SuppressTokensLogitsProcessor,
+    TemperatureLogitsWarper,
+    TopKLogitsWarper,
+    TopPLogitsWarper,
+    TypicalLogitsWarper,
+    UnbatchedClassifierFreeGuidanceLogitsProcessor,
+)
+from .stopping_criteria import (
+    ConfidenceCriteria,
+    EosTokenCriteria,
+    MaxLengthCriteria,
+    MaxTimeCriteria,
+    StoppingCriteria,
+    StoppingCriteriaList,
+    StopStringCriteria,
+)
+
+
+if TYPE_CHECKING:
+    from ..modeling_utils import PreTrainedModel
+    from ..tokenization_utils_base import PreTrainedTokenizerBase
+    from .streamers import BaseStreamer
+
+logger = logging.get_logger(__name__)
+
+if is_accelerate_available():
+    from accelerate.hooks import AlignDevicesHook, add_hook_to_module
+
+
+@dataclass
+class GenerateDecoderOnlyOutput(ModelOutput):
+    """
+    Outputs of decoder-only generation models, when using non-beam methods.
+
+    Args:
+        sequences (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
+            The generated sequences. The second dimension (sequence_length) is either equal to `max_length` or shorter
+            if all batches finished early due to the `eos_token_id`.
+        scores (`tuple(torch.FloatTensor)` *optional*, returned when `output_scores=True`):
+            Processed prediction scores of the language modeling head (scores for each vocabulary token before SoftMax)
+            at each generation step. Tuple of `torch.FloatTensor` with up to `max_new_tokens` elements (one element for
+            each generated token), with each tensor of shape `(batch_size, config.vocab_size)`.
+        logits (`tuple(torch.FloatTensor)` *optional*, returned when `output_logits=True`):
+            Unprocessed prediction scores of the language modeling head (scores for each vocabulary token before SoftMax)
+            at each generation step. Tuple of `torch.FloatTensor` with up to `max_new_tokens` elements (one element for
+            each generated token), with each tensor of shape `(batch_size, config.vocab_size)`.
+        attentions (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_attentions=True`):
+            Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of
+            `torch.FloatTensor` of shape `(batch_size, num_heads, generated_length, sequence_length)`.
+        hidden_states (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_hidden_states=True`):
+            Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of
+            `torch.FloatTensor` of shape `(batch_size, generated_length, hidden_size)`.
+        past_key_values (`tuple(tuple(torch.FloatTensor)))`, *optional*, returned when `use_cache=True`):
+            Returns the model cache, used to speed up decoding. Different models have a different cache format, check
+            the model's documentation. Usually, a [`~cache_utils.Cache`] instance.
+    """
+
+    sequences: torch.LongTensor = None
+    scores: Optional[Tuple[torch.FloatTensor]] = None
+    logits: Optional[Tuple[torch.FloatTensor]] = None
+    attentions: Optional[Tuple[Tuple[torch.FloatTensor]]] = None
+    hidden_states: Optional[Tuple[Tuple[torch.FloatTensor]]] = None
+    past_key_values: Optional[Tuple[Tuple[Tuple[torch.FloatTensor]]]] = None
+
+
+@dataclass
+class GenerateEncoderDecoderOutput(ModelOutput):
+    """
+    Outputs of encoder-decoder generation models, when using non-beam methods.
+
+    Args:
+        sequences (`torch.LongTensor` of shape `(batch_size*num_return_sequences, sequence_length)`):
+            The generated sequences. The second dimension (sequence_length) is either equal to `max_length` or shorter
+            if all batches finished early due to the `eos_token_id`.
+        scores (`tuple(torch.FloatTensor)` *optional*, returned when `output_scores=True`):
+            Processed prediction scores of the language modeling head (scores for each vocabulary token before SoftMax)
+            at each generation step. Tuple of `torch.FloatTensor` with up to `max_new_tokens` elements (one element for
+            each generated token), with each tensor of shape `(batch_size, config.vocab_size)`.
+        logits (`tuple(torch.FloatTensor)` *optional*, returned when `output_logits=True`):
+            Unprocessed prediction scores of the language modeling head (scores for each vocabulary token before SoftMax)
+            at each generation step. Tuple of `torch.FloatTensor` with up to `max_new_tokens` elements (one element for
+            each generated token), with each tensor of shape `(batch_size, config.vocab_size)`.
+        encoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True`):
+            Tuple of `torch.FloatTensor` (one for each layer of the decoder) of shape `(batch_size, num_heads,
+            sequence_length, sequence_length)`.
+        encoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True`):
+            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
+            shape `(batch_size, sequence_length, hidden_size)`.
+        decoder_attentions (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_attentions=True`):
+            Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of
+            `torch.FloatTensor` of shape `(batch_size, num_heads, generated_length, sequence_length)`.
+        cross_attentions (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_attentions=True`):
+            Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of
+            `torch.FloatTensor` of shape `(batch_size, num_heads, generated_length, sequence_length)`.
+        decoder_hidden_states (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_hidden_states=True`):
+            Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of
+            `torch.FloatTensor` of shape `(batch_size, generated_length, hidden_size)`.
+        past_key_values (`tuple(tuple(torch.FloatTensor)))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
+            Returns the model cache, used to speed up decoding. Different models have a different cache format, check
+            the model's documentation. Usually, a [`~cache_utils.Cache`] instance.
+    """
+
+    sequences: torch.LongTensor = None
+    scores: Optional[Tuple[torch.FloatTensor]] = None
+    logits: Optional[Tuple[torch.FloatTensor]] = None
+    encoder_attentions: Optional[Tuple[torch.FloatTensor]] = None
+    encoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
+    decoder_attentions: Optional[Tuple[Tuple[torch.FloatTensor]]] = None
+    cross_attentions: Optional[Tuple[Tuple[torch.FloatTensor]]] = None
+    decoder_hidden_states: Optional[Tuple[Tuple[torch.FloatTensor]]] = None
+    past_key_values: Optional[Tuple[Tuple[Tuple[torch.FloatTensor]]]] = None
+
+
+@dataclass
+class GenerateBeamDecoderOnlyOutput(ModelOutput):
+    """
+    Outputs of decoder-only generation models, when using beam methods.
+
+    Args:
+        sequences (`torch.LongTensor` of shape `(batch_size*num_return_sequences, sequence_length)`):
+            The generated sequences. The second dimension (sequence_length) is either equal to `max_length` or shorter
+            if all batches finished early due to the `eos_token_id`.
+        sequences_scores (`torch.FloatTensor` of shape `(batch_size*num_return_sequences)`, *optional*, returned when `output_scores=True`):
+            Final beam scores of the generated `sequences`.
+        scores (`tuple(torch.FloatTensor)` *optional*, returned when `output_scores=True`):
+            Beam transition scores for each vocabulary token at each generation step. Beam transition scores consisting
+            of log probabilities of tokens conditioned on log softmax of previously generated tokens in this beam.
+            Tuple of `torch.FloatTensor` with up to `max_new_tokens` elements (one element for each generated token),
+            with each tensor of shape `(batch_size*num_beams, config.vocab_size)`.
+        logits (`tuple(torch.FloatTensor)` *optional*, returned when `output_logits=True`):
+            Unprocessed prediction scores of the language modeling head (scores for each vocabulary token before SoftMax)
+            at each generation step. Tuple of `torch.FloatTensor` with up to `max_new_tokens` elements (one element for
+            each generated token), with each tensor of shape `(batch_size, config.vocab_size)`.
+        beam_indices (`torch.LongTensor`, *optional*, returned when `output_scores=True`):
+            Beam indices of generated token id at each generation step. `torch.LongTensor` of shape
+            `(batch_size*num_return_sequences, sequence_length)`.
+        attentions (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_attentions=True`):
+            Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of
+            `torch.FloatTensor` of shape `(batch_size*num_beams, num_heads, generated_length, sequence_length)`.
+        hidden_states (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_hidden_states=True`):
+            Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of
+            `torch.FloatTensor` of shape `(batch_size*num_beams*num_return_sequences, generated_length, hidden_size)`.
+        past_key_values (`tuple(tuple(torch.FloatTensor)))`, *optional*, returned when `use_cache=True`):
+            Returns the model cache, used to speed up decoding. Different models have a different cache format, check
+            the model's documentation. Usually, a [`~cache_utils.Cache`] instance.
+    """
+
+    sequences: torch.LongTensor = None
+    sequences_scores: Optional[torch.FloatTensor] = None
+    scores: Optional[Tuple[torch.FloatTensor]] = None
+    logits: Optional[Tuple[torch.FloatTensor]] = None
+    beam_indices: Optional[torch.LongTensor] = None
+    attentions: Optional[Tuple[Tuple[torch.FloatTensor]]] = None
+    hidden_states: Optional[Tuple[Tuple[torch.FloatTensor]]] = None
+    past_key_values: Optional[Tuple[Tuple[Tuple[torch.FloatTensor]]]] = None
+
+
+@dataclass
+class GenerateBeamEncoderDecoderOutput(ModelOutput):
+    """
+    Outputs of encoder-decoder generation models, when using beam methods.
+
+    Args:
+        sequences (`torch.LongTensor` of shape `(batch_size*num_return_sequences, sequence_length)`):
+            The generated sequences. The second dimension (sequence_length) is either equal to `max_length` or shorter
+            if all batches finished early due to the `eos_token_id`.
+        sequences_scores (`torch.FloatTensor` of shape `(batch_size*num_return_sequences)`, *optional*, returned when `output_scores=True`):
+            Final beam scores of the generated `sequences`.
+        scores (`tuple(torch.FloatTensor)` *optional*, returned when `output_scores=True`):
+            Beam transition scores for each vocabulary token at each generation step. Beam transition scores consisting
+            of log probabilities of tokens conditioned on log softmax of previously generated tokens in this beam.
+            Tuple of `torch.FloatTensor` with up to `max_new_tokens` elements (one element for each generated token),
+            with each tensor of shape `(batch_size*num_beams, config.vocab_size)`.
+        logits (`tuple(torch.FloatTensor)` *optional*, returned when `output_logits=True`):
+            Unprocessed prediction scores of the language modeling head (scores for each vocabulary token before SoftMax)
+            at each generation step. Tuple of `torch.FloatTensor` with up to `max_new_tokens` elements (one element for
+            each generated token), with each tensor of shape `(batch_size, config.vocab_size)`.
+        beam_indices (`torch.LongTensor`, *optional*, returned when `output_scores=True`):
+            Beam indices of generated token id at each generation step. `torch.LongTensor` of shape
+            `(batch_size*num_return_sequences, sequence_length)`.
+        encoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True`):
+            Tuple of `torch.FloatTensor` (one for each layer of the decoder) of shape `(batch_size, num_heads,
+            sequence_length, sequence_length)`.
+        encoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True`):
+            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
+            shape `(batch_size*num_beams*num_return_sequences, sequence_length, hidden_size)`.
+        decoder_attentions (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_attentions=True`):
+            Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of
+            `torch.FloatTensor` of shape `(batch_size*num_beams*num_return_sequences, num_heads, generated_length,
+            sequence_length)`.
+        cross_attentions (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_attentions=True`):
+            Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of
+            `torch.FloatTensor` of shape `(batch_size, num_heads, generated_length, sequence_length)`.
+        decoder_hidden_states (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_hidden_states=True`):
+            Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of
+            `torch.FloatTensor` of shape `(batch_size*num_beams*num_return_sequences, generated_length, hidden_size)`.
+        past_key_values (`tuple(tuple(torch.FloatTensor)))`, *optional*, returned when `use_cache=True`):
+            Returns the model cache, used to speed up decoding. Different models have a different cache format, check
+            the model's documentation. Usually, a [`~cache_utils.Cache`] instance.
+    """
+
+    sequences: torch.LongTensor = None
+    sequences_scores: Optional[torch.FloatTensor] = None
+    scores: Optional[Tuple[torch.FloatTensor]] = None
+    logits: Optional[Tuple[torch.FloatTensor]] = None
+    beam_indices: Optional[torch.LongTensor] = None
+    encoder_attentions: Optional[Tuple[torch.FloatTensor]] = None
+    encoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
+    decoder_attentions: Optional[Tuple[Tuple[torch.FloatTensor]]] = None
+    cross_attentions: Optional[Tuple[Tuple[torch.FloatTensor]]] = None
+    decoder_hidden_states: Optional[Tuple[Tuple[torch.FloatTensor]]] = None
+    past_key_values: Optional[Tuple[Tuple[Tuple[torch.FloatTensor]]]] = None
+
+
+# TODO (joao): remove the equivalent classes and typing shortcuts below in v5
+# Equivalent classes (kept for retrocompatibility purposes)
+GreedySearchDecoderOnlyOutput = GenerateDecoderOnlyOutput
+ContrastiveSearchDecoderOnlyOutput = GenerateDecoderOnlyOutput
+SampleDecoderOnlyOutput = GenerateDecoderOnlyOutput
+
+ContrastiveSearchEncoderDecoderOutput = GenerateEncoderDecoderOutput
+GreedySearchEncoderDecoderOutput = GenerateEncoderDecoderOutput
+SampleEncoderDecoderOutput = GenerateEncoderDecoderOutput
+
+BeamSearchDecoderOnlyOutput = GenerateBeamDecoderOnlyOutput
+BeamSampleDecoderOnlyOutput = GenerateBeamDecoderOnlyOutput
+
+BeamSearchEncoderDecoderOutput = GenerateBeamEncoderDecoderOutput
+BeamSampleEncoderDecoderOutput = GenerateBeamEncoderDecoderOutput
+
+GreedySearchOutput = Union[GreedySearchEncoderDecoderOutput, GreedySearchDecoderOnlyOutput]
+SampleOutput = Union[SampleEncoderDecoderOutput, SampleDecoderOnlyOutput]
+BeamSearchOutput = Union[BeamSearchEncoderDecoderOutput, BeamSearchDecoderOnlyOutput]
+BeamSampleOutput = Union[BeamSampleEncoderDecoderOutput, BeamSampleDecoderOnlyOutput]
+ContrastiveSearchOutput = Union[ContrastiveSearchEncoderDecoderOutput, ContrastiveSearchDecoderOnlyOutput]
+
+# Typing shortcuts
+GenerateNonBeamOutput = Union[GenerateDecoderOnlyOutput, GenerateEncoderDecoderOutput]
+GenerateBeamOutput = Union[GenerateBeamDecoderOnlyOutput, GenerateBeamEncoderDecoderOutput]
+GenerateOutput = Union[GenerateNonBeamOutput, GenerateBeamOutput]
+
+
+class GenerationMixin:
+    """
+    A class containing all functions for auto-regressive text generation, to be used as a mixin in [`PreTrainedModel`].
+
+    The class exposes [`~generation.GenerationMixin.generate`], which can be used for:
+        - *greedy decoding* if `num_beams=1` and `do_sample=False`
+        - *contrastive search* if `penalty_alpha>0` and `top_k>1`
+        - *multinomial sampling* if `num_beams=1` and `do_sample=True`
+        - *beam-search decoding* if `num_beams>1` and `do_sample=False`
+        - *beam-search multinomial sampling* if `num_beams>1` and `do_sample=True`
+        - *diverse beam-search decoding* if `num_beams>1` and `num_beam_groups>1`
+        - *constrained beam-search decoding* if `constraints!=None` or `force_words_ids!=None`
+        - *assisted decoding* if `assistant_model` or `prompt_lookup_num_tokens` is passed to `.generate()`
+
+    To learn more about decoding strategies refer to the [text generation strategies guide](../generation_strategies).
+    """
+
+    def prepare_inputs_for_generation(
+        self,
+        input_ids: torch.LongTensor,
+        past_key_values: Optional[Cache] = None,
+        attention_mask: Optional[torch.LongTensor] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        **kwargs,
+    ):
+        """
+        Prepare the model inputs for generation. In includes operations like computing the 4D attention mask or
+        slicing inputs given the existing cache.
+
+        See the forward pass in the model documentation for expected arguments (different models might have different
+        requirements for e.g. `past_key_values`). This function should work as is for most LLMs.
+        """
+
+        # 1. Handle BC:
+        model_inputs = {}
+        # - some models don't have `Cache` support (which implies they don't expect `cache_position` in `forward`)
+        if self._supports_cache_class:
+            model_inputs["cache_position"] = cache_position
+        # - `cache_position` was not a mandatory input in `prepare_inputs_for_generation` for those models, and this
+        #   function may be called outside of `generate`. Handle most use cases by creating `cache_position` on the fly
+        #   (this alternative is not as robust as calling `generate` and letting it create `cache_position`)
+        elif cache_position is None:
+            past_length = past_key_values[0][0].shape[2] if past_key_values is not None else 0
+            cache_position = torch.arange(past_length, input_ids.shape[1], dtype=torch.long, device=input_ids.device)
+
+        # 2. Generic cache-dependent input preparation
+        # If we have cache: let's slice `input_ids` through `cache_position`, to keep only the unprocessed tokens
+        # Exception 1: when passing input_embeds, input_ids may be missing entries
+        # Exception 2: some generation methods do special slicing of input_ids, so we don't need to do it here
+        # Exception 3: with synced GPUs cache_position may go out of bounds, but we only want dummy token in that case.
+        #              (we can't check exception 3 while compiling)
+        if past_key_values is not None:
+            model_inputs["past_key_values"] = past_key_values
+            if (
+                inputs_embeds is not None  # Exception 1
+                or (is_torchdynamo_compiling() or cache_position[-1] >= input_ids.shape[1])  # Exception 3
+            ):
+                input_ids = input_ids[:, -cache_position.shape[0] :]
+            elif input_ids.shape[1] != cache_position.shape[0]:  # Default case (the "else", a no op, is Exception 2)
+                input_ids = input_ids[:, cache_position]
+
+        # 3. Prepare base model inputs
+        input_ids_key = "decoder_input_ids" if self.config.is_encoder_decoder else "input_ids"
+        # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
+        if not self.config.is_encoder_decoder:
+            if inputs_embeds is not None and cache_position[0] == 0:
+                model_inputs[input_ids_key] = None
+                model_inputs["inputs_embeds"] = inputs_embeds
+            else:
+                # `clone` calls in this function ensure a consistent stride. See #32227
+                model_inputs[input_ids_key] = input_ids.clone(memory_format=torch.contiguous_format)
+                model_inputs["inputs_embeds"] = None
+        else:
+            model_inputs[input_ids_key] = input_ids.clone(memory_format=torch.contiguous_format)
+
+        # 4. Create missing `position_ids` on the fly
+        if (
+            attention_mask is not None
+            and kwargs.get("position_ids") is None
+            and "position_ids" in set(inspect.signature(self.forward).parameters.keys())
+        ):
+            position_ids = attention_mask.long().cumsum(-1) - 1
+            position_ids.masked_fill_(attention_mask == 0, 1)
+            kwargs["position_ids"] = position_ids  # placed in kwargs for further processing (see below)
+
+        # 5. Slice model inputs if it's an input that should have the same length as `input_ids`
+        for model_input_name in ["position_ids", "token_type_ids"]:
+            model_input = kwargs.get(model_input_name)
+            if model_input is not None:
+                if past_key_values is not None:
+                    current_input_length = (
+                        model_inputs["inputs_embeds"].shape[1]
+                        if model_inputs["inputs_embeds"] is not None
+                        else model_inputs[input_ids_key].shape[1]
+                    )
+                    model_input = model_input[:, -current_input_length:]
+                    model_input = model_input.clone(memory_format=torch.contiguous_format)
+                model_inputs[model_input_name] = model_input
+
+        # 6. Create 4D attention mask is we are using a `StaticCache` (important for performant compiled forward pass)
+        if isinstance(past_key_values, StaticCache) and attention_mask.ndim == 2:
+            if model_inputs["inputs_embeds"] is not None:
+                batch_size, sequence_length, _ = model_inputs["inputs_embeds"].shape
+                device = model_inputs["inputs_embeds"].device
+            else:
+                batch_size, sequence_length = model_inputs[input_ids_key].shape
+                device = model_inputs[input_ids_key].device
+
+            # Create the causal mask with fixed shape in advance, to reduce recompilations. If the function to create
+            # the 4D causal mask exists, it should be present in the base model (XXXModel class).
+            base_model = getattr(self, self.base_model_prefix, None)
+            if base_model is None:
+                causal_mask_creation_function = getattr(
+                    self, "_prepare_4d_causal_attention_mask_with_cache_position", None
+                )
+            else:
+                causal_mask_creation_function = getattr(
+                    base_model, "_prepare_4d_causal_attention_mask_with_cache_position", None
+                )
+            if causal_mask_creation_function is None:
+                logger.warning_once(
+                    f"{self.__class__.__name__} has no `_prepare_4d_causal_attention_mask_with_cache_position` method "
+                    "defined in its base modeling class. Compiled forward passes will be sub-optimal. If you're "
+                    "writing code, see Llama for an example implementation. If you're a user, please report this "
+                    "issue on GitHub."
+                )
+            else:
+                attention_mask = causal_mask_creation_function(
+                    attention_mask,
+                    sequence_length=sequence_length,
+                    target_length=past_key_values.get_max_cache_shape(),
+                    dtype=self.dtype,
+                    device=device,
+                    cache_position=cache_position,
+                    batch_size=batch_size,
+                    config=self.config,
+                    past_key_values=past_key_values,
+                )
+        if attention_mask is not None:
+            model_inputs["attention_mask"] = attention_mask
+
+        # 7. Forward ALL kwargs that are uninitialized (e.g. `use_cache`).
+        for key, value in kwargs.items():
+            if key not in model_inputs:
+                model_inputs[key] = value
+
+        # 8. Remove unexpected `generate` inputs (TODO @joao: fix trainer and examples)
+        model_inputs.pop("labels", None)
+        return model_inputs
+
+    def _prepare_model_inputs(
+        self,
+        inputs: Optional[torch.Tensor] = None,
+        bos_token_id: Optional[torch.Tensor] = None,
+        model_kwargs: Optional[Dict[str, torch.Tensor]] = None,
+    ) -> Tuple[torch.Tensor, Optional[str], Dict[str, torch.Tensor]]:
+        """
+        This function extracts the model-specific `inputs` for generation.
+        """
+        # 1. retrieve all kwargs that are non-None or non-model input related.
+        # some encoder-decoder models have different names for model and encoder
+        if (
+            self.config.is_encoder_decoder
+            and hasattr(self, "encoder")
+            and self.encoder.main_input_name != self.main_input_name
+        ):
+            input_name = self.encoder.main_input_name
+        else:
+            input_name = self.main_input_name
+
+        model_kwargs = {k: v for k, v in model_kwargs.items() if v is not None or k != input_name}
+
+        # 2. check whether model_input_name is passed as kwarg
+        # if yes and `inputs` is None use kwarg inputs
+        inputs_kwarg = model_kwargs.pop(input_name, None)
+        if inputs_kwarg is not None and inputs is not None:
+            raise ValueError(
+                f"`inputs`: {inputs}` were passed alongside {input_name} which is not allowed. "
+                f"Make sure to either pass {inputs} or {input_name}=..."
+            )
+        elif inputs_kwarg is not None:
+            inputs = inputs_kwarg
+
+        # 3. In the presence of `inputs_embeds` for text models:
+        # - decoder-only models should complain if the user attempts to pass `inputs_embeds`, but the model
+        # doesn't have its forwarding implemented. `inputs_embeds` is kept in `model_kwargs` and can coexist with
+        # input_ids (`inputs_embeds` will be used in the 1st generation step, as opposed to `input_ids`)
+        # - encoder-decoder models should complain if the user attempts to pass `inputs_embeds` and `input_ids`, and
+        # pull the former to inputs. It will be used in place of `input_ids` to get the encoder hidden states.
+        if input_name == "input_ids" and "inputs_embeds" in model_kwargs:
+            if not self.config.is_encoder_decoder:
+                has_inputs_embeds_forwarding = "inputs_embeds" in set(
+                    inspect.signature(self.prepare_inputs_for_generation).parameters.keys()
+                )
+                if not has_inputs_embeds_forwarding:
+                    raise ValueError(
+                        f"You passed `inputs_embeds` to `.generate()`, but the model class {self.__class__.__name__} "
+                        "doesn't have its forwarding implemented. See the GPT2 implementation for an example "
+                        "(https://github.com/huggingface/transformers/pull/21405), and feel free to open a PR with it!"
+                    )
+                # In this case, `input_ids` is moved to the `model_kwargs`, so a few automations (like the creation of
+                # the attention mask) can rely on the actual model input.
+                model_kwargs["input_ids"] = self._maybe_initialize_input_ids_for_generation(
+                    inputs, bos_token_id, model_kwargs=model_kwargs
+                )
+            else:
+                if inputs is not None:
+                    raise ValueError("You passed `inputs_embeds` and `input_ids` to `.generate()`. Please pick one.")
+            inputs, input_name = model_kwargs["inputs_embeds"], "inputs_embeds"
+
+        # 4. if `inputs` is still None, try to create `input_ids` from BOS token
+        inputs = self._maybe_initialize_input_ids_for_generation(inputs, bos_token_id, model_kwargs)
+        return inputs, input_name, model_kwargs
+
+    def _maybe_initialize_input_ids_for_generation(
+        self,
+        inputs: Optional[torch.Tensor] = None,
+        bos_token_id: Optional[torch.Tensor] = None,
+        model_kwargs: Optional[Dict[str, torch.Tensor]] = None,
+    ) -> torch.LongTensor:
+        """Initializes input ids for generation, if necessary."""
+        if inputs is not None:
+            return inputs
+
+        encoder_outputs = model_kwargs.get("encoder_outputs")
+        if self.config.is_encoder_decoder and encoder_outputs is not None:
+            # make dummy input_ids with value -100, as a sanity check ensuring that they won't be used for encoding
+            shape = encoder_outputs.last_hidden_state.size()[:-1]
+            return torch.ones(shape, dtype=torch.long, device=self.device) * -100
+
+        # If there is some tensor in `model_kwargs`, we can infer the batch size from it. This is helpful with
+        # soft-prompting or in multimodal implementations built on top of decoder-only language models.
+        batch_size = 1
+        for value in model_kwargs.values():
+            if isinstance(value, torch.Tensor):
+                batch_size = value.shape[0]
+                break
+
+        if "inputs_embeds" in model_kwargs:
+            return torch.ones((batch_size, 0), dtype=torch.long, device=self.device)
+
+        if bos_token_id is None:
+            raise ValueError("`bos_token_id` has to be defined when no `input_ids` are provided.")
+
+        return torch.ones((batch_size, 1), dtype=torch.long, device=self.device) * bos_token_id
+
+    def _prepare_attention_mask_for_generation(
+        self,
+        inputs_tensor: torch.Tensor,
+        generation_config: GenerationConfig,
+        model_kwargs: Dict[str, Any],
+    ) -> torch.LongTensor:
+        pad_token_id = generation_config._pad_token_tensor
+        eos_token_id = generation_config._eos_token_tensor
+
+        # `input_ids` may be present in the model kwargs, instead of being the main input (e.g. multimodal model)
+        if "input_ids" in model_kwargs and model_kwargs["input_ids"].shape[1] > 0:
+            inputs_tensor = model_kwargs["input_ids"]
+
+        # No information for attention mask inference -> return default attention mask
+        default_attention_mask = torch.ones(inputs_tensor.shape[:2], dtype=torch.long, device=inputs_tensor.device)
+        if pad_token_id is None:
+            return default_attention_mask
+
+        is_input_ids = len(inputs_tensor.shape) == 2 and inputs_tensor.dtype in [torch.int, torch.long]
+        if not is_input_ids:
+            return default_attention_mask
+
+        is_pad_token_in_inputs = (pad_token_id is not None) and (
+            isin_mps_friendly(elements=inputs_tensor, test_elements=pad_token_id).any()
+        )
+        is_pad_token_not_equal_to_eos_token_id = (eos_token_id is None) or ~(
+            isin_mps_friendly(elements=eos_token_id, test_elements=pad_token_id).any()
+        )
+        can_infer_attention_mask = is_pad_token_in_inputs * is_pad_token_not_equal_to_eos_token_id
+        attention_mask_from_padding = inputs_tensor.ne(pad_token_id).long()
+
+        attention_mask = (
+            attention_mask_from_padding * can_infer_attention_mask + default_attention_mask * ~can_infer_attention_mask
+        )
+        return attention_mask
+
+    def _prepare_encoder_decoder_kwargs_for_generation(
+        self,
+        inputs_tensor: torch.Tensor,
+        model_kwargs,
+        model_input_name: Optional[str],
+        generation_config: GenerationConfig,
+    ) -> Dict[str, Any]:
+        # 1. get encoder
+        encoder = self.get_encoder()
+        # Compatibility with Accelerate big model inference: we need the encoder to outputs stuff on the same device
+        # as the inputs.
+        if hasattr(self, "hf_device_map"):
+            if hasattr(encoder, "_hf_hook"):
+                encoder._hf_hook.io_same_device = True
+            else:
+                add_hook_to_module(encoder, AlignDevicesHook(io_same_device=True))
+
+        # 2. Prepare encoder args and encoder kwargs from model kwargs and generation config.
+        irrelevant_prefix = ["decoder_", "cross_attn", "use_cache"]
+        encoder_kwargs = {
+            argument: value
+            for argument, value in model_kwargs.items()
+            if not any(argument.startswith(p) for p in irrelevant_prefix)
+        }
+        encoder_signature = set(inspect.signature(encoder.forward).parameters)
+        encoder_accepts_wildcard = "kwargs" in encoder_signature or "model_kwargs" in encoder_signature
+        if not encoder_accepts_wildcard:
+            encoder_kwargs = {
+                argument: value for argument, value in encoder_kwargs.items() if argument in encoder_signature
+            }
+        encoder_kwargs["output_attentions"] = generation_config.output_attentions
+        encoder_kwargs["output_hidden_states"] = generation_config.output_hidden_states
+
+        # 3. make sure that encoder returns `ModelOutput`
+        model_input_name = model_input_name if model_input_name is not None else self.main_input_name
+        encoder_kwargs["return_dict"] = True
+        encoder_kwargs[model_input_name] = inputs_tensor
+        model_kwargs["encoder_outputs"]: ModelOutput = encoder(**encoder_kwargs)  # type: ignore
+
+        return model_kwargs
+
+    def _prepare_decoder_input_ids_for_generation(
+        self,
+        batch_size: int,
+        model_input_name: str,
+        model_kwargs: Dict[str, torch.Tensor],
+        decoder_start_token_id: torch.Tensor,
+        device: torch.device = None,
+    ) -> Tuple[torch.LongTensor, Dict[str, torch.Tensor]]:
+        """Prepares `decoder_input_ids` for generation with encoder-decoder models"""
+        # 1. Check whether the user has defined `decoder_input_ids` manually. To facilitate in terms of input naming,
+        # we also allow the user to pass it under `input_ids`, if the encoder does not use it as the main input.
+        if model_kwargs is not None and "decoder_input_ids" in model_kwargs:
+            decoder_input_ids = model_kwargs.pop("decoder_input_ids")
+        elif "input_ids" in model_kwargs and model_input_name != "input_ids":
+            decoder_input_ids = model_kwargs.pop("input_ids")
+        else:
+            decoder_input_ids = None
+
+        # 2. `decoder_start_token_id` must have shape (batch_size, 1)
+        if device is None:
+            device = self.device
+        if decoder_start_token_id.ndim == 1:
+            if decoder_start_token_id.shape[0] != batch_size:
+                raise ValueError(
+                    f"`decoder_start_token_id` expected to have length {batch_size} but got {decoder_start_token_id.shape[0]}"
+                )
+            decoder_start_token_id = decoder_start_token_id.view(-1, 1)
+        else:
+            decoder_start_token_id = (
+                torch.ones((batch_size, 1), dtype=torch.long, device=device) * decoder_start_token_id
+            )
+
+        # 3. Encoder-decoder models expect the `decoder_input_ids` to start with a special token. Let's ensure that.
+        # no user input -> use decoder_start_token_id as decoder_input_ids
+        if decoder_input_ids is None:
+            decoder_input_ids = decoder_start_token_id
+        # exception: Donut checkpoints have task-specific decoder starts and don't expect a BOS token. Note that the
+        # original checkpoints can't be detected through `self.__class__.__name__.lower()`, needing custom logic.
+        # See: https://github.com/huggingface/transformers/pull/31470
+        elif "donut" in self.__class__.__name__.lower() or (
+            self.config.model_type == "vision-encoder-decoder" and "donut" in self.config.encoder.model_type.lower()
+        ):
+            pass
+        elif self.config.model_type in ["whisper"]:
+            pass
+        # user input but doesn't start with decoder_start_token_id -> prepend decoder_start_token_id (and adjust
+        # decoder_attention_mask if provided)
+        elif (decoder_input_ids[:, 0] != decoder_start_token_id[:, 0]).all().item():
+            decoder_input_ids = torch.cat([decoder_start_token_id, decoder_input_ids], dim=-1)
+            if "decoder_attention_mask" in model_kwargs:
+                decoder_attention_mask = model_kwargs["decoder_attention_mask"]
+                decoder_attention_mask = torch.cat(
+                    (torch.ones_like(decoder_attention_mask)[:, :1], decoder_attention_mask),
+                    dim=-1,
+                )
+                model_kwargs["decoder_attention_mask"] = decoder_attention_mask
+
+        return decoder_input_ids, model_kwargs
+
+    @staticmethod
+    def _expand_inputs_for_generation(
+        expand_size: int = 1,
+        is_encoder_decoder: bool = False,
+        input_ids: Optional[torch.LongTensor] = None,
+        **model_kwargs,
+    ) -> Tuple[torch.LongTensor, Dict[str, Any]]:
+        """Expands tensors from [batch_size, ...] to [batch_size * expand_size, ...]"""
+        # Do not call torch.repeat_interleave if expand_size is 1 because it clones
+        # the input tensor and thus requires more memory although no change is applied
+        if expand_size == 1:
+            return input_ids, model_kwargs
+
+        def _expand_dict_for_generation(dict_to_expand):
+            for key in dict_to_expand:
+                if (
+                    key != "cache_position"
+                    and dict_to_expand[key] is not None
+                    and isinstance(dict_to_expand[key], torch.Tensor)
+                ):
+                    dict_to_expand[key] = dict_to_expand[key].repeat_interleave(expand_size, dim=0)
+            return dict_to_expand
+
+        if input_ids is not None:
+            input_ids = input_ids.repeat_interleave(expand_size, dim=0)
+
+        model_kwargs = _expand_dict_for_generation(model_kwargs)
+
+        if is_encoder_decoder:
+            if model_kwargs.get("encoder_outputs") is None:
+                raise ValueError("If `is_encoder_decoder` is True, make sure that `encoder_outputs` is defined.")
+            model_kwargs["encoder_outputs"] = _expand_dict_for_generation(model_kwargs["encoder_outputs"])
+
+        return input_ids, model_kwargs
+
+    def _extract_past_from_model_output(self, outputs: ModelOutput):
+        past_key_values = None
+        cache_name = "past_key_values"
+        if "past_key_values" in outputs:
+            past_key_values = outputs.past_key_values
+        elif "mems" in outputs:
+            past_key_values = outputs.mems
+        elif "past_buckets_states" in outputs:
+            past_key_values = outputs.past_buckets_states
+        elif "cache_params" in outputs:
+            past_key_values = outputs.cache_params
+            cache_name = "cache_params"
+
+        return cache_name, past_key_values
+
+    def _update_model_kwargs_for_generation(
+        self,
+        outputs: ModelOutput,
+        model_kwargs: Dict[str, Any],
+        is_encoder_decoder: bool = False,
+        num_new_tokens: int = 1,
+    ) -> Dict[str, Any]:
+        # update past_key_values keeping its naming used in model code
+        cache_name, cache = self._extract_past_from_model_output(outputs)
+        model_kwargs[cache_name] = cache
+        if getattr(outputs, "state", None) is not None:
+            model_kwargs["state"] = outputs.state
+
+        # update token_type_ids with last value
+        if "token_type_ids" in model_kwargs:
+            token_type_ids = model_kwargs["token_type_ids"]
+            model_kwargs["token_type_ids"] = torch.cat([token_type_ids, token_type_ids[:, -1].unsqueeze(-1)], dim=-1)
+
+        if not is_encoder_decoder:
+            # update attention mask
+            if "attention_mask" in model_kwargs:
+                attention_mask = model_kwargs["attention_mask"]
+                model_kwargs["attention_mask"] = torch.cat(
+                    [attention_mask, attention_mask.new_ones((attention_mask.shape[0], 1))], dim=-1
+                )
+        else:
+            # update decoder attention mask
+            if "decoder_attention_mask" in model_kwargs:
+                decoder_attention_mask = model_kwargs["decoder_attention_mask"]
+                model_kwargs["decoder_attention_mask"] = torch.cat(
+                    [decoder_attention_mask, decoder_attention_mask.new_ones((decoder_attention_mask.shape[0], 1))],
+                    dim=-1,
+                )
+
+        if model_kwargs.get("use_cache", True):
+            model_kwargs["cache_position"] = model_kwargs["cache_position"][-1:] + num_new_tokens
+        else:
+            past_positions = model_kwargs.pop("cache_position")
+            new_positions = torch.arange(
+                past_positions[-1] + 1, past_positions[-1] + num_new_tokens + 1, dtype=past_positions.dtype
+            ).to(past_positions.device)
+            model_kwargs["cache_position"] = torch.cat((past_positions, new_positions))
+        return model_kwargs
+
+    def _reorder_cache(self, past_key_values, beam_idx):
+        raise NotImplementedError(
+            f"Make sure that a `_reorder_cache` function is correctly implemented in {self.__class__.__module__} to"
+            f" enable beam search for {self.__class__}"
+        )
+
+    def _get_candidate_generator(
+        self,
+        generation_config: GenerationConfig,
+        input_ids: torch.LongTensor,
+        inputs_tensor: torch.Tensor,
+        assistant_model: "PreTrainedModel",
+        logits_processor: LogitsProcessorList,
+        target_tokenizer: "PreTrainedTokenizerBase",
+        assistant_tokenizer: "PreTrainedTokenizerBase",
+        model_kwargs: Dict,
+    ) -> CandidateGenerator:
+        """
+        Returns the candidate generator to be used in `assisted_generation`
+        """
+        different_tokenizers = all(v is not None for v in (assistant_model, target_tokenizer, assistant_tokenizer))
+
+        if generation_config.assistant_early_exit is not None:
+            candidate_generator = EarlyExitCandidateGenerator(
+                input_ids=input_ids,
+                assistant_model=self,
+                generation_config=generation_config,
+                model_kwargs=model_kwargs,
+                inputs_tensor=inputs_tensor,
+                logits_processor=logits_processor,
+            )
+        elif generation_config.prompt_lookup_num_tokens is not None:
+            candidate_generator = PromptLookupCandidateGenerator(
+                eos_token_id=generation_config._eos_token_tensor,
+                num_output_tokens=generation_config.prompt_lookup_num_tokens,
+                max_matching_ngram_size=generation_config.max_matching_ngram_size,
+                max_length=generation_config.max_length,
+            )
+        elif different_tokenizers:
+            candidate_generator = AssistedCandidateGeneratorDifferentTokenizers(
+                input_ids=input_ids,
+                assistant_model=assistant_model,
+                generation_config=generation_config,
+                model_kwargs=model_kwargs,
+                inputs_tensor=inputs_tensor,
+                logits_processor=logits_processor,
+                target_tokenizer=target_tokenizer,
+                assistant_tokenizer=assistant_tokenizer,
+            )
+        else:
+            candidate_generator = AssistedCandidateGenerator(
+                input_ids=input_ids,
+                assistant_model=assistant_model,
+                generation_config=generation_config,
+                model_kwargs=model_kwargs,
+                inputs_tensor=inputs_tensor,
+                logits_processor=logits_processor,
+            )
+        return candidate_generator
+
+    def _get_logits_processor(
+        self,
+        generation_config: GenerationConfig,
+        input_ids_seq_length: int,
+        encoder_input_ids: torch.LongTensor,
+        prefix_allowed_tokens_fn: Callable[[int, torch.Tensor], List[int]],
+        logits_processor: Optional[LogitsProcessorList],
+        device: str = None,
+        model_kwargs: Optional[Dict[str, Any]] = None,
+        negative_prompt_ids: Optional[torch.Tensor] = None,
+        negative_prompt_attention_mask: Optional[torch.Tensor] = None,
+    ) -> LogitsProcessorList:
+        """
+        This class returns a [`LogitsProcessorList`] list object that contains all relevant [`LogitsProcessor`]
+        instances used to modify the scores of the language model head.
+        """
+        # instantiate processors list
+        processors = LogitsProcessorList()
+
+        if generation_config.guidance_scale is not None and generation_config.guidance_scale != 1:
+            processors.append(
+                UnbatchedClassifierFreeGuidanceLogitsProcessor(
+                    generation_config.guidance_scale,
+                    self,
+                    unconditional_ids=negative_prompt_ids,
+                    unconditional_attention_mask=negative_prompt_attention_mask,
+                    use_cache=generation_config.use_cache,
+                )
+            )
+        if generation_config.sequence_bias is not None:
+            processors.append(SequenceBiasLogitsProcessor(sequence_bias=generation_config.sequence_bias))
+
+        if generation_config.diversity_penalty is not None and generation_config.diversity_penalty > 0.0:
+            processors.append(
+                HammingDiversityLogitsProcessor(
+                    diversity_penalty=generation_config.diversity_penalty,
+                    num_beams=generation_config.num_beams,
+                    num_beam_groups=generation_config.num_beam_groups,
+                )
+            )
+        if (
+            generation_config.encoder_repetition_penalty is not None
+            and generation_config.encoder_repetition_penalty != 1.0
+        ):
+            if len(encoder_input_ids.shape) == 2:
+                processors.append(
+                    EncoderRepetitionPenaltyLogitsProcessor(
+                        penalty=generation_config.encoder_repetition_penalty,
+                        encoder_input_ids=encoder_input_ids,
+                    )
+                )
+            else:
+                warnings.warn(
+                    "Passing `encoder_repetition_penalty` requires some form of `input_ids` to be passed to "
+                    "`generate`, ignoring the argument.",
+                    UserWarning,
+                )
+        if generation_config.repetition_penalty is not None and generation_config.repetition_penalty != 1.0:
+            processors.append(RepetitionPenaltyLogitsProcessor(penalty=generation_config.repetition_penalty))
+        if generation_config.no_repeat_ngram_size is not None and generation_config.no_repeat_ngram_size > 0:
+            processors.append(NoRepeatNGramLogitsProcessor(generation_config.no_repeat_ngram_size))
+        if (
+            generation_config.encoder_no_repeat_ngram_size is not None
+            and generation_config.encoder_no_repeat_ngram_size > 0
+        ):
+            if len(encoder_input_ids.shape) == 2:
+                processors.append(
+                    EncoderNoRepeatNGramLogitsProcessor(
+                        generation_config.encoder_no_repeat_ngram_size,
+                        encoder_input_ids,
+                    )
+                )
+            else:
+                warnings.warn(
+                    "Passing `encoder_no_repeat_ngram_size` requires some form of `input_ids` to be passed to "
+                    "`generate`, ignoring the argument.",
+                    UserWarning,
+                )
+        if generation_config.bad_words_ids is not None:
+            processors.append(
+                NoBadWordsLogitsProcessor(
+                    generation_config.bad_words_ids,
+                    generation_config._eos_token_tensor,
+                )
+            )
+        if (
+            generation_config.min_length is not None
+            and generation_config._eos_token_tensor is not None
+            and generation_config.min_length > 0
+        ):
+            processors.append(
+                MinLengthLogitsProcessor(
+                    generation_config.min_length,
+                    generation_config._eos_token_tensor,
+                    device=device,
+                )
+            )
+        if (
+            generation_config.min_new_tokens is not None
+            and generation_config._eos_token_tensor is not None
+            and generation_config.min_new_tokens > 0
+        ):
+            processors.append(
+                MinNewTokensLengthLogitsProcessor(
+                    input_ids_seq_length,
+                    generation_config.min_new_tokens,
+                    generation_config._eos_token_tensor,
+                    device=device,
+                )
+            )
+        if prefix_allowed_tokens_fn is not None:
+            processors.append(
+                PrefixConstrainedLogitsProcessor(
+                    prefix_allowed_tokens_fn,
+                    generation_config.num_beams // generation_config.num_beam_groups,
+                )
+            )
+        if generation_config.forced_bos_token_id is not None:
+            processors.append(
+                ForcedBOSTokenLogitsProcessor(
+                    generation_config.forced_bos_token_id,
+                )
+            )
+        if generation_config.forced_eos_token_id is not None:
+            processors.append(
+                ForcedEOSTokenLogitsProcessor(
+                    generation_config.max_length,
+                    generation_config.forced_eos_token_id,
+                    device=device,
+                )
+            )
+        if generation_config.remove_invalid_values is True:
+            processors.append(InfNanRemoveLogitsProcessor())
+        if generation_config.exponential_decay_length_penalty is not None:
+            processors.append(
+                ExponentialDecayLengthPenalty(
+                    generation_config.exponential_decay_length_penalty,
+                    generation_config._eos_token_tensor,
+                    input_ids_seq_length,
+                )
+            )
+        if generation_config.suppress_tokens is not None:
+            processors.append(
+                SuppressTokensLogitsProcessor(
+                    generation_config.suppress_tokens,
+                    device=device,
+                )
+            )
+        if generation_config.begin_suppress_tokens is not None:
+            begin_index = input_ids_seq_length
+            begin_index = (
+                begin_index
+                if (input_ids_seq_length > 1 or generation_config.forced_bos_token_id is None)
+                else begin_index + 1
+            )
+            processors.append(
+                SuppressTokensAtBeginLogitsProcessor(
+                    generation_config.begin_suppress_tokens,
+                    begin_index,
+                    device=device,
+                )
+            )
+        if generation_config.forced_decoder_ids is not None:
+            # TODO (sanchit): move this exception to GenerationConfig.validate() when TF & FLAX are aligned with PT
+            raise ValueError(
+                "You have explicitly specified `forced_decoder_ids`. Please remove the `forced_decoder_ids` argument "
+                "in favour of `input_ids` or `decoder_input_ids` respectively.",
+            )
+
+        # TODO (joao): find a strategy to specify the order of the processors
+        processors = self._merge_criteria_processor_list(processors, logits_processor)
+
+        # Processors previously known as `LogitsWarpers`, only applied with sampling strategies
+        if generation_config.do_sample:
+            # In beam methods, we need to keep at least one non-eos token to explore continuations that might have a
+            # better score (i.e. keep len(list(generation_config._eos_token_tensor)) + 1)
+            if generation_config.num_beams > 1:
+                if isinstance(generation_config._eos_token_tensor, list):
+                    min_tokens_to_keep = len(generation_config._eos_token_tensor) + 1
+                elif isinstance(generation_config._eos_token_tensor, torch.Tensor):
+                    min_tokens_to_keep = generation_config._eos_token_tensor.shape[0] + 1
+                else:
+                    min_tokens_to_keep = 2
+            else:
+                min_tokens_to_keep = 1
+
+            # the following idea is largely copied from this PR: https://github.com/huggingface/transformers/pull/5420/files
+            # all samplers can be found in `generation_utils_samplers.py`
+            if generation_config.temperature is not None and generation_config.temperature != 1.0:
+                processors.append(TemperatureLogitsWarper(generation_config.temperature))
+            if generation_config.top_k is not None and generation_config.top_k != 0:
+                processors.append(
+                    TopKLogitsWarper(top_k=generation_config.top_k, min_tokens_to_keep=min_tokens_to_keep)
+                )
+            if generation_config.top_p is not None and generation_config.top_p < 1.0:
+                processors.append(
+                    TopPLogitsWarper(top_p=generation_config.top_p, min_tokens_to_keep=min_tokens_to_keep)
+                )
+            if generation_config.min_p is not None:
+                # Applied after temperature scaling (see https://github.com/ggerganov/llama.cpp/pull/3841#issuecomment-2073826084)
+                processors.append(
+                    MinPLogitsWarper(min_p=generation_config.min_p, min_tokens_to_keep=min_tokens_to_keep)
+                )
+            if generation_config.typical_p is not None and generation_config.typical_p < 1.0:
+                processors.append(
+                    TypicalLogitsWarper(mass=generation_config.typical_p, min_tokens_to_keep=min_tokens_to_keep)
+                )
+            if generation_config.epsilon_cutoff is not None and 0.0 < generation_config.epsilon_cutoff < 1.0:
+                processors.append(
+                    EpsilonLogitsWarper(
+                        epsilon=generation_config.epsilon_cutoff, min_tokens_to_keep=min_tokens_to_keep
+                    )
+                )
+            if generation_config.eta_cutoff is not None and 0.0 < generation_config.eta_cutoff < 1.0:
+                processors.append(
+                    EtaLogitsWarper(
+                        epsilon=generation_config.eta_cutoff, min_tokens_to_keep=min_tokens_to_keep, device=device
+                    )
+                )
+
+        # Watermarking should be after all logits processing is finished (see #34630)
+        if generation_config.watermarking_config is not None:
+            processors.append(
+                generation_config.watermarking_config.construct_processor(self.config.vocab_size, device)
+            )
+
+        # `LogitNormalization` should always be the last logit processor, when present
+        if generation_config.renormalize_logits is True:
+            processors.append(LogitNormalization())
+        return processors
+
+    def _get_stopping_criteria(
+        self,
+        generation_config: GenerationConfig,
+        stopping_criteria: Optional[StoppingCriteriaList],
+        tokenizer: Optional["PreTrainedTokenizerBase"] = None,
+        **kwargs,
+    ) -> StoppingCriteriaList:
+        criteria = StoppingCriteriaList()
+        if generation_config.max_length is not None:
+            max_position_embeddings = getattr(self.config, "max_position_embeddings", None)
+            criteria.append(
+                MaxLengthCriteria(
+                    max_length=generation_config.max_length,
+                    max_position_embeddings=max_position_embeddings,
+                )
+            )
+        if generation_config.max_time is not None:
+            criteria.append(MaxTimeCriteria(max_time=generation_config.max_time))
+        if generation_config.stop_strings is not None:
+            if tokenizer is None:
+                raise ValueError(
+                    "There are one or more stop strings, either in the arguments to `generate` or in the "
+                    "model's generation config, but we could not locate a tokenizer. When generating with "
+                    "stop strings, you must pass the model's tokenizer to the `tokenizer` argument of `generate`."
+                )
+            criteria.append(StopStringCriteria(stop_strings=generation_config.stop_strings, tokenizer=tokenizer))
+        if generation_config._eos_token_tensor is not None:
+            criteria.append(EosTokenCriteria(eos_token_id=generation_config._eos_token_tensor))
+        if (
+            generation_config.is_assistant
+            and generation_config.assistant_confidence_threshold is not None
+            and generation_config.assistant_confidence_threshold > 0
+        ):
+            criteria.append(
+                ConfidenceCriteria(assistant_confidence_threshold=generation_config.assistant_confidence_threshold)
+            )
+        criteria = self._merge_criteria_processor_list(criteria, stopping_criteria)
+        return criteria
+
+    def _merge_criteria_processor_list(
+        self,
+        default_list: Union[LogitsProcessorList, StoppingCriteriaList],
+        custom_list: Union[LogitsProcessorList, StoppingCriteriaList],
+    ) -> Union[LogitsProcessorList, StoppingCriteriaList]:
+        if len(custom_list) == 0:
+            return default_list
+        for default in default_list:
+            for custom in custom_list:
+                if type(custom) is type(default):
+                    object_type = "stopping criteria" if isinstance(custom, StoppingCriteria) else "logits processor"
+                    raise ValueError(
+                        f"A custom {object_type} of type {type(custom)} with values {custom} has been passed to"
+                        f" `.generate()`, but it has already been created with the values {default}. {default} has been"
+                        " created by passing the corresponding arguments to generate or by the model's config default"
+                        f" values. If you just want to change the default values of {object_type} consider passing"
+                        f" them as arguments to `.generate()` instead of using a custom {object_type}."
+                    )
+        default_list.extend(custom_list)
+        return default_list
+
+    def compute_transition_scores(
+        self,
+        sequences: torch.Tensor,
+        scores: Tuple[torch.Tensor],
+        beam_indices: Optional[torch.Tensor] = None,
+        normalize_logits: bool = False,
+    ) -> torch.Tensor:
+        """
+        Computes the transition scores of sequences given the generation scores (and beam indices, if beam search was
+        used). This is a convenient method to quicky obtain the scores of the selected tokens at generation time.
+
+        Parameters:
+            sequences (`torch.LongTensor`):
+                The generated sequences. The second dimension (sequence_length) is either equal to `max_length` or
+                shorter if all batches finished early due to the `eos_token_id`.
+            scores (`tuple(torch.FloatTensor)`):
+                Transition scores for each vocabulary token at each generation step. Beam transition scores consisting
+                of log probabilities of tokens conditioned on log softmax of previously generated tokens in this beam.
+                Tuple of `torch.FloatTensor` with up to `max_new_tokens` elements (one element for each generated token),
+                with each tensor of shape `(batch_size*num_beams, config.vocab_size)`.
+            beam_indices (`torch.LongTensor`, *optional*):
+                Beam indices of generated token id at each generation step. `torch.LongTensor` of shape
+                `(batch_size*num_return_sequences, sequence_length)`. Only required if a `num_beams>1` at
+                generate-time.
+            normalize_logits (`bool`, *optional*, defaults to `False`):
+                Whether to normalize the logits (which, for legacy reasons, may be unnormalized).
+
+        Return:
+            `torch.Tensor`: A `torch.Tensor` of shape `(batch_size*num_return_sequences, sequence_length)` containing
+                the transition scores (logits)
+
+        Examples:
+
+        ```python
+        >>> from transformers import GPT2Tokenizer, AutoModelForCausalLM
+        >>> import numpy as np
+
+        >>> tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
+        >>> model = AutoModelForCausalLM.from_pretrained("openai-community/gpt2")
+        >>> tokenizer.pad_token_id = tokenizer.eos_token_id
+        >>> inputs = tokenizer(["Today is"], return_tensors="pt")
+
+        >>> # Example 1: Print the scores for each token generated with Greedy Search
+        >>> outputs = model.generate(**inputs, max_new_tokens=5, return_dict_in_generate=True, output_scores=True)
+        >>> transition_scores = model.compute_transition_scores(
+        ...     outputs.sequences, outputs.scores, normalize_logits=True
+        ... )
+        >>> # input_length is the length of the input prompt for decoder-only models, like the GPT family, and 1 for
+        >>> # encoder-decoder models, like BART or T5.
+        >>> input_length = 1 if model.config.is_encoder_decoder else inputs.input_ids.shape[1]
+        >>> generated_tokens = outputs.sequences[:, input_length:]
+        >>> for tok, score in zip(generated_tokens[0], transition_scores[0]):
+        ...     # | token | token string | log probability | probability
+        ...     print(f"| {tok:5d} | {tokenizer.decode(tok):8s} | {score.numpy():.3f} | {np.exp(score.numpy()):.2%}")
+        |   262 |  the     | -1.414 | 24.33%
+        |  1110 |  day     | -2.609 | 7.36%
+        |   618 |  when    | -2.010 | 13.40%
+        |   356 |  we      | -1.859 | 15.58%
+        |   460 |  can     | -2.508 | 8.14%
+
+        >>> # Example 2: Reconstruct the sequence scores from Beam Search
+        >>> outputs = model.generate(
+        ...     **inputs,
+        ...     max_new_tokens=5,
+        ...     num_beams=4,
+        ...     num_return_sequences=4,
+        ...     return_dict_in_generate=True,
+        ...     output_scores=True,
+        ... )
+        >>> transition_scores = model.compute_transition_scores(
+        ...     outputs.sequences, outputs.scores, outputs.beam_indices, normalize_logits=False
+        ... )
+        >>> # If you sum the generated tokens' scores and apply the length penalty, you'll get the sequence scores.
+        >>> # Tip 1: recomputing the scores is only guaranteed to match with `normalize_logits=False`. Depending on the
+        >>> # use case, you might want to recompute it with `normalize_logits=True`.
+        >>> # Tip 2: the output length does NOT include the input length
+        >>> output_length = np.sum(transition_scores.numpy() < 0, axis=1)
+        >>> length_penalty = model.generation_config.length_penalty
+        >>> reconstructed_scores = transition_scores.sum(axis=1) / (output_length**length_penalty)
+        >>> print(np.allclose(outputs.sequences_scores, reconstructed_scores))
+        True
+        ```"""
+        # 1. In absence of `beam_indices`, we can assume that we come from e.g. greedy search, which is equivalent
+        # to a beam search approach were the first (and only) beam is always selected
+        if beam_indices is None:
+            beam_indices = torch.arange(scores[0].shape[0]).view(-1, 1).to(sequences.device)
+            beam_indices = beam_indices.expand(-1, len(scores))
+
+        # 2. reshape scores as [batch_size*vocab_size, # generation steps] with # generation steps being
+        # seq_len - input_length
+        scores = torch.stack(scores).reshape(len(scores), -1).transpose(0, 1)
+
+        # 3. Optionally normalize the logits (across the vocab dimension)
+        if normalize_logits:
+            scores = scores.reshape(-1, self.config.vocab_size, scores.shape[-1])
+            scores = torch.nn.functional.log_softmax(scores, dim=1)
+            scores = scores.reshape(-1, scores.shape[-1])
+
+        # 4. cut beam_indices to longest beam length
+        beam_indices_mask = beam_indices < 0
+        max_beam_length = (1 - beam_indices_mask.long()).sum(-1).max()
+        beam_indices = beam_indices.clone()[:, :max_beam_length]
+        beam_indices_mask = beam_indices_mask[:, :max_beam_length]
+
+        # 5. Set indices of beams that finished early to 0; such indices will be masked correctly afterwards
+        beam_indices[beam_indices_mask] = 0
+
+        # 6. multiply beam_indices with vocab size to gather correctly from scores
+        beam_sequence_indices = beam_indices * self.config.vocab_size
+
+        # 7. Define which indices contributed to scores
+        cut_idx = sequences.shape[-1] - max_beam_length
+        indices = sequences[:, cut_idx:] + beam_sequence_indices
+
+        # 8. Compute scores
+        transition_scores = scores.gather(0, indices)
+
+        # 9. Mask out transition_scores of beams that stopped early
+        transition_scores[beam_indices_mask] = 0
+
+        return transition_scores
+
+    def _validate_model_class(self):
+        """
+        Confirms that the model class is compatible with generation. If not, raises an exception that points to the
+        right class to use.
+        """
+        # TODO(joao): remove this function in v4.50, i.e. when we remove the inheritance of `GenerationMixin` from
+        # `PreTrainedModel`. With that inheritance removed, all model classes inheriting from `GenerationMixin` can
+        # safely call `GenerationMixin.generate`
+        if not is_torchdynamo_compiling() and not self.can_generate():
+            terminations_with_generation_support = [
+                "ForCausalLM",
+                "ForConditionalGeneration",
+                "ForSpeechSeq2Seq",
+                "ForVision2Seq",
+            ]
+            raise TypeError(
+                f"The current model class ({self.__class__.__name__}) is not compatible with `.generate()`, as "
+                "it doesn't have a language model head. Classes that support generation often end in one of these "
+                f"names: {terminations_with_generation_support}."
+            )
+
+    def _validate_assistant(self, assistant_model, tokenizer, assistant_tokenizer):
+        if assistant_model is None:
+            return
+
+        if self.config.is_encoder_decoder and not assistant_model.config.is_encoder_decoder:
+            attributes_to_check = ["encoder_attention_heads", "encoder_ffn_dim", "encoder_layers"]
+            attributes_to_check = [attr for attr in dir(assistant_model.config) if attr in attributes_to_check]
+            are_equal = all(
+                getattr(self.config, attr) == getattr(assistant_model.config, attr) for attr in attributes_to_check
+            )
+            if not are_equal:
+                raise ValueError(
+                    "The main model and the assistant don't have compatible encoder-dependent input shapes. "
+                    "Ensure you load the assistant with the correct encoder-decoder class, e.g. `AutoModelForSpeechSeq2Seq` for Whisper."
+                )
+
+        doc_reference = (
+            "(see https://huggingface.co/docs/transformers/en/generation_strategies#universal-assisted-decoding)"
+        )
+        if self.config.get_text_config().vocab_size == assistant_model.config.get_text_config().vocab_size:
+            if assistant_tokenizer is not None:
+                raise ValueError(
+                    f"`assistant_tokenizer` is not required when the main and assistant models use the same tokenizer. Please omit `assistant_tokenizer` from `generate()` {doc_reference}."
+                )
+        else:
+            if tokenizer is None or assistant_tokenizer is None:
+                raise ValueError(
+                    f"The main and assistant moedels have different tokenizers. Please provide `tokenizer` and `assistant_tokenizer` to `generate()` {doc_reference}."
+                )
+
+    def _validate_model_kwargs(self, model_kwargs: Dict[str, Any]):
+        """Validates model kwargs for generation. Generate argument typos will also be caught here."""
+        # If a `Cache` instance is passed, checks whether the model is compatible with it
+        if isinstance(model_kwargs.get("past_key_values", None), Cache) and not self._supports_cache_class:
+            raise ValueError(
+                f"{self.__class__.__name__} does not support an instance of `Cache` as `past_key_values`. Please "
+                "check the model documentation for supported cache formats."
+            )
+
+        # Excludes arguments that are handled before calling any model function
+        if self.config.is_encoder_decoder:
+            for key in ["decoder_input_ids"]:
+                model_kwargs.pop(key, None)
+
+        unused_model_args = []
+        model_args = set(inspect.signature(self.prepare_inputs_for_generation).parameters)
+        # `kwargs`/`model_kwargs` is often used to handle optional forward pass inputs like `attention_mask`. If
+        # `prepare_inputs_for_generation` doesn't accept them, then a stricter check can be made ;)
+        if "kwargs" in model_args or "model_kwargs" in model_args:
+            model_args |= set(inspect.signature(self.forward).parameters)
+
+        # Encoder-Decoder models may also need Encoder arguments from `model_kwargs`
+        if self.config.is_encoder_decoder:
+            base_model = getattr(self, self.base_model_prefix, None)
+
+            # allow encoder kwargs
+            encoder = getattr(self, "encoder", None)
+            # `MusicgenForConditionalGeneration` has `text_encoder` and `audio_encoder`.
+            # Also, it has `base_model_prefix = "encoder_decoder"` but there is no `self.encoder_decoder`
+            # TODO: A better way to handle this.
+            if encoder is None and base_model is not None:
+                encoder = getattr(base_model, "encoder", None)
+
+            if encoder is not None:
+                encoder_model_args = set(inspect.signature(encoder.forward).parameters)
+                model_args |= encoder_model_args
+
+            # allow decoder kwargs
+            decoder = getattr(self, "decoder", None)
+            if decoder is None and base_model is not None:
+                decoder = getattr(base_model, "decoder", None)
+
+            if decoder is not None:
+                decoder_model_args = set(inspect.signature(decoder.forward).parameters)
+                model_args |= {f"decoder_{x}" for x in decoder_model_args}
+
+            # allow assistant_encoder_outputs to be passed if we're doing assisted generating
+            if "assistant_encoder_outputs" in model_kwargs:
+                model_args |= {"assistant_encoder_outputs"}
+
+        for key, value in model_kwargs.items():
+            if value is not None and key not in model_args:
+                unused_model_args.append(key)
+
+        if unused_model_args:
+            raise ValueError(
+                f"The following `model_kwargs` are not used by the model: {unused_model_args} (note: typos in the"
+                " generate arguments will also show up in this list)"
+            )
+
+    def _validate_generated_length(self, generation_config, input_ids_length, has_default_max_length):
+        """Performs validation related to the resulting generated length"""
+
+        # Can't throw warnings/exceptions during compilation
+        if is_torchdynamo_compiling():
+            return
+
+        # 1. Max length warnings related to poor parameterization
+        if has_default_max_length and generation_config.max_new_tokens is None and generation_config.max_length == 20:
+            # 20 is the default max_length of the generation config
+            warnings.warn(
+                f"Using the model-agnostic default `max_length` (={generation_config.max_length}) to control the "
+                "generation length. We recommend setting `max_new_tokens` to control the maximum length of the "
+                "generation.",
+                UserWarning,
+            )
+        if input_ids_length >= generation_config.max_length:
+            input_ids_string = "decoder_input_ids" if self.config.is_encoder_decoder else "input_ids"
+            raise ValueError(
+                f"Input length of {input_ids_string} is {input_ids_length}, but `max_length` is set to"
+                f" {generation_config.max_length}. This can lead to unexpected behavior. You should consider"
+                " increasing `max_length` or, better yet, setting `max_new_tokens`."
+            )
+
+        # 2. Min length warnings due to unfeasible parameter combinations
+        min_length_error_suffix = (
+            " Generation will stop at the defined maximum length. You should decrease the minimum length and/or "
+            "increase the maximum length."
+        )
+        if has_default_max_length:
+            min_length_error_suffix += (
+                f" Note that `max_length` is set to {generation_config.max_length}, its default value."
+            )
+        if generation_config.min_length is not None and generation_config.min_length > generation_config.max_length:
+            warnings.warn(
+                f"Unfeasible length constraints: `min_length` ({generation_config.min_length}) is larger than"
+                f" the maximum possible length ({generation_config.max_length})." + min_length_error_suffix,
+                UserWarning,
+            )
+        if generation_config.min_new_tokens is not None:
+            min_length = generation_config.min_new_tokens + input_ids_length
+            if min_length > generation_config.max_length:
+                warnings.warn(
+                    f"Unfeasible length constraints: `min_new_tokens` ({generation_config.min_new_tokens}), when "
+                    f"added to the prompt length ({input_ids_length}), is larger than"
+                    f" the maximum possible length ({generation_config.max_length})." + min_length_error_suffix,
+                    UserWarning,
+                )
+
+    def _prepare_generated_length(
+        self,
+        generation_config,
+        has_default_max_length,
+        has_default_min_length,
+        model_input_name,
+        input_ids_length,
+        inputs_tensor,
+    ):
+        """Prepared max and min length in generation configs to avoid clashes between similar attributes"""
+
+        if generation_config.max_new_tokens is not None:
+            if not has_default_max_length and generation_config.max_length is not None:
+                logger.warning(
+                    f"Both `max_new_tokens` (={generation_config.max_new_tokens}) and `max_length`(="
+                    f"{generation_config.max_length}) seem to have been set. `max_new_tokens` will take precedence. "
+                    "Please refer to the documentation for more information. "
+                    "(https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)"
+                )
+            generation_config.max_length = generation_config.max_new_tokens + input_ids_length
+
+        # if both `inputs_embeds` and `input_ids` are passed, we do not correct the length
+        # otherwise we need total length [inputs-embeds-len + new-tokens-len] to not go beyond indicated `max_length``
+        elif (
+            model_input_name == "inputs_embeds"
+            and input_ids_length != inputs_tensor.shape[1]
+            and input_ids_length != 0
+            and not self.config.is_encoder_decoder
+        ):
+            generation_config.max_length -= inputs_tensor.shape[1]
+        elif has_default_max_length:  # by default let's always generate 20 new tokens
+            if generation_config.max_length == GenerationConfig().max_length:
+                generation_config.max_length = generation_config.max_length + input_ids_length
+                max_position_embeddings = getattr(self.config, "max_position_embeddings", None)
+                if max_position_embeddings is not None:
+                    generation_config.max_length = min(generation_config.max_length, max_position_embeddings)
+
+        # same for min length
+        if generation_config.min_new_tokens is not None:
+            if not has_default_min_length:
+                logger.warning(
+                    f"Both `min_new_tokens` (={generation_config.min_new_tokens}) and `min_length`(="
+                    f"{generation_config.min_length}) seem to have been set. `min_new_tokens` will take precedence. "
+                    "Please refer to the documentation for more information. "
+                    "(https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)"
+                )
+            generation_config.min_length = generation_config.min_new_tokens + input_ids_length
+
+        elif (
+            model_input_name == "inputs_embeds"
+            and input_ids_length != inputs_tensor.shape[1]
+            and not self.config.is_encoder_decoder
+        ):
+            generation_config.min_length = max(generation_config.min_length - inputs_tensor.shape[1], 0)
+
+        return generation_config
+
+    def _prepare_generation_config(
+        self, generation_config: Optional[GenerationConfig], **kwargs: Dict
+    ) -> Tuple[GenerationConfig, Dict]:
+        """
+        Prepares the base generation config, then applies any generation configuration options from kwargs. This
+        function handles retrocompatibility with respect to configuration files.
+        """
+        # TODO joao: when we can detect `fullgraph=True` in `torch.compile` (https://github.com/pytorch/pytorch/pull/120400)
+        # replace `is_torchdynamo_compiling` by the corresponding check. As it is, we are being too restrictive with
+        # the parameterization in `fullgraph=False` so as to enable `fullgraph=True`.
+
+        # priority: `generation_config` argument > `model.generation_config` (the default generation config)
+        using_model_generation_config = False
+        if generation_config is None:
+            # legacy: users may modify the model configuration to control generation. To trigger this legacy behavior,
+            # the following conditions must be met
+            # 1) the generation config must have been created from the model config (`_from_model_config` field);
+            # 2) the generation config must have seen no modification since its creation (the hash is the same);
+            # 3) there are non-default generation parameters in the model config.
+            # 4) the user must have set new generation parameters in the model config.
+            # NOTE: `torch.compile` can't compile `hash`, this legacy support is disabled with compilation.
+            if (
+                not is_torchdynamo_compiling()
+                and self.generation_config._from_model_config  # 1)
+                and self.generation_config._original_object_hash == hash(self.generation_config)  # 2)
+                and len(self.config._get_non_default_generation_parameters()) > 0  # 3)
+            ):
+                new_generation_config = GenerationConfig.from_model_config(self.config)
+                if new_generation_config != self.generation_config:  # 4)
+                    warnings.warn(
+                        "You have modified the pretrained model configuration to control generation. This is a"
+                        " deprecated strategy to control generation and will be removed in v5."
+                        " Please use and modify the model generation configuration (see"
+                        " https://huggingface.co/docs/transformers/generation_strategies#default-text-generation-configuration )",
+                        UserWarning,
+                    )
+                    self.generation_config = new_generation_config
+
+            generation_config = self.generation_config
+            using_model_generation_config = True
+
+        # `torch.compile` can't compile `copy.deepcopy`, arguments in `kwargs` that are part of `generation_config`
+        # will mutate the object with `.update`. As such, passing these arguments through `kwargs` is disabled -- an
+        # exception will be raised in `_validate_model_kwargs`
+        if not is_torchdynamo_compiling():
+            generation_config = copy.deepcopy(generation_config)
+            model_kwargs = generation_config.update(**kwargs)
+            # If `generation_config` is provided, let's fallback ALL special tokens to the default values for the model
+            if not using_model_generation_config:
+                if generation_config.bos_token_id is None:
+                    generation_config.bos_token_id = self.generation_config.bos_token_id
+                if generation_config.eos_token_id is None:
+                    generation_config.eos_token_id = self.generation_config.eos_token_id
+                if generation_config.pad_token_id is None:
+                    generation_config.pad_token_id = self.generation_config.pad_token_id
+                if generation_config.decoder_start_token_id is None:
+                    generation_config.decoder_start_token_id = self.generation_config.decoder_start_token_id
+        else:
+            model_kwargs = kwargs
+
+        return generation_config, model_kwargs
+
+    def _get_initial_cache_position(self, input_ids, model_kwargs):
+        """Calculates `cache_position` for the pre-fill stage based on `input_ids` and optionally past length"""
+        # `torch.compile`-friendly `torch.arange` from a shape -- the lines below are equivalent to `torch.arange`
+        if "inputs_embeds" in model_kwargs and not self.config.is_encoder_decoder:
+            cache_position = torch.ones_like(model_kwargs["inputs_embeds"][0, :, 0], dtype=torch.int64).cumsum(0) - 1
+        elif "decoder_inputs_embeds" in model_kwargs and self.config.is_encoder_decoder:
+            cache_position = (
+                torch.ones_like(model_kwargs["decoder_inputs_embeds"][0, :, 0], dtype=torch.int64).cumsum(0) - 1
+            )
+        else:
+            cache_position = torch.ones_like(input_ids[0, :], dtype=torch.int64).cumsum(0) - 1
+
+        past_length = 0
+        if model_kwargs.get("past_key_values") is not None:
+            cache = model_kwargs["past_key_values"]
+            past_length = 0
+            if not isinstance(cache, Cache):
+                past_length = cache[0][0].shape[2]
+            elif hasattr(cache, "get_seq_length") and cache.get_seq_length() is not None:
+                past_length = cache.get_seq_length()
+
+            # TODO(joao): this is not torch.compile-friendly, find a work-around. If the cache is not empty,
+            # end-to-end compilation will yield bad results because `cache_position` will be incorrect.
+            if not is_torchdynamo_compiling():
+                cache_position = cache_position[past_length:]
+
+        model_kwargs["cache_position"] = cache_position
+        return model_kwargs
+
+    def _get_cache(
+        self, cache_implementation: str, batch_size: int, max_cache_len: int, device: torch.device, model_kwargs
+    ) -> Cache:
+        """
+        Sets a cache for `generate`, that will persist across calls. A new cache will only be initialized a
+        new `generate` call requires a larger cache or uses a different batch size.
+
+        Returns the resulting cache object.
+        """
+        cache_cls: Cache = NEED_SETUP_CACHE_CLASSES_MAPPING[cache_implementation]
+        requires_cross_attention_cache = (
+            self.config.is_encoder_decoder or model_kwargs.get("encoder_outputs") is not None
+        )
+
+        if hasattr(self, "_cache"):
+            cache_to_check = self._cache.self_attention_cache if requires_cross_attention_cache else self._cache
+
+        if cache_implementation == "sliding_window":
+            max_cache_len = min(self.config.sliding_window, max_cache_len)
+
+        need_new_cache = (
+            not hasattr(self, "_cache")
+            or (not isinstance(cache_to_check, cache_cls))
+            or cache_to_check.max_batch_size != batch_size
+        )
+        if cache_implementation != "mamba":
+            need_new_cache = need_new_cache or cache_to_check.max_cache_len < max_cache_len
+
+        if requires_cross_attention_cache and hasattr(self, "_cache"):
+            need_new_cache = (
+                need_new_cache
+                or self._cache.cross_attention_cache.max_cache_len != model_kwargs["encoder_outputs"][0].shape[1]
+            )
+
+        if need_new_cache:
+            if hasattr(self.config, "_pre_quantization_dtype"):
+                cache_dtype = self.config._pre_quantization_dtype
+            else:
+                if not is_torchdynamo_compiling():
+                    cache_dtype = self.dtype
+                else:
+                    # NOTE: self.dtype is not compatible with torch.compile, as it calls `self.parameters()`.
+                    # Workaround: trust the lm_head, whose attribute name is somewhat consistent across generative
+                    # models. May cause trobles with non-text modalities.
+                    cache_dtype = self.get_output_embeddings().weight.dtype
+
+            def get_layer_device_map(execution_device_map: Optional[dict] = None):
+                num_hidden_layers = self.config.get_text_config().num_hidden_layers
+                if execution_device_map is None:
+                    return None
+                elif len(execution_device_map) == 1 and "" in execution_device_map:
+                    return {idx: execution_device_map[""] for idx in range(num_hidden_layers)}
+                layer_device_map = {}
+                for layer in execution_device_map:
+                    for idx in range(num_hidden_layers):
+                        if f".{idx}." in f"{layer}.":
+                            layer_device_map[idx] = execution_device_map[layer]
+                            break
+                for idx in range(num_hidden_layers):
+                    if idx not in layer_device_map:
+                        raise RuntimeError(f"layer {idx} has not been mapped to a device.")
+                return layer_device_map
+
+            execution_device_map = None
+            # Taken from dispatch_model from accelerate.
+            # This is needed here if we don't want to make changes in accelerate in order to save execution_device
+            # For offloaded case, we need to get the execution device, not just the device where it is offloaded
+            if hasattr(self, "hf_device_map"):
+                if set(self.hf_device_map.values()) == {"cpu"} or set(self.hf_device_map.values()) == {"cpu", "disk"}:
+                    main_device = "cpu"
+                else:
+                    main_device = [d for d in self.hf_device_map.values() if d not in ["cpu", "disk"]][0]
+                execution_device_map = {
+                    name: main_device if device in ["cpu", "disk"] else device
+                    for name, device in self.hf_device_map.items()
+                }
+            layer_device_map = get_layer_device_map(execution_device_map)
+
+            cache_kwargs = {
+                "config": self.config.get_text_config(),
+                "max_batch_size": batch_size,
+                "max_cache_len": max_cache_len,
+                "device": device,
+                "dtype": cache_dtype,
+                "layer_device_map": layer_device_map,
+            }
+            self._cache = cache_cls(**cache_kwargs)
+            if requires_cross_attention_cache:
+                encoder_kwargs = cache_kwargs.copy()
+                encoder_kwargs["max_cache_len"] = model_kwargs["encoder_outputs"][0].shape[1]
+                self._cache = EncoderDecoderCache(self._cache, cache_cls(**encoder_kwargs))
+        else:
+            self._cache.reset()
+        return self._cache
+
+    def _supports_default_dynamic_cache(self) -> bool:
+        """
+        Return `True` if current model can use a `DynamicCache` instance when initializing the `past_key_values`.
+        This is mostly the same as `_supports_cache_class` attribute, but add exception for `Jamba` model which
+        uses its own `HybridMambaAttentionDynamicCache` and do not need to initialize the Cache in advance in
+        order to save memory (because no back and forth `to_legacy_cache` and `from_legacy_cache` will be performed
+        for `HybridMambaAttentionDynamicCache`).
+        """
+        return (
+            self._supports_cache_class
+            and "jamba" not in self.__class__.__name__.lower()
+            and "zamba" not in self.__class__.__name__.lower()
+            and "bamba" not in self.__class__.__name__.lower()
+        )
+
+    def _prepare_cache_for_generation(
+        self,
+        generation_config: GenerationConfig,
+        model_kwargs: Dict,
+        assistant_model: "PreTrainedModel",
+        batch_size: int,
+        max_cache_length: int,
+        device: torch.device,
+    ) -> bool:
+        """
+        Prepares the cache for generation (if applicable), given `generate`'s parameterization. If a cache is
+        instantiated, writes it to `model_kwargs`, under the name expected by the model.
+        """
+
+        cache_name = "past_key_values" if "mamba" not in self.__class__.__name__.lower() else "cache_params"
+        requires_cross_attention_cache = (
+            self.config.is_encoder_decoder or model_kwargs.get("encoder_outputs") is not None
+        )
+
+        # Quick escape route 1: if the user specifies a cache, we only need to:
+        # a) check for conflicting `generate` arguments
+        # b) convert to the new cache format (if the user passes a legacy cache and model supports it)
+        user_defined_cache = model_kwargs.get(cache_name)
+        if user_defined_cache is not None:
+            if generation_config.cache_implementation is not None:
+                raise ValueError(
+                    f"Passing both `cache_implementation` (used to initialize certain caches) and `{cache_name}` (a "
+                    "Cache object) is unsupported. Please use only one of the two."
+                )
+            if isinstance(user_defined_cache, tuple) and self._supports_default_dynamic_cache():
+                model_kwargs[cache_name] = (
+                    DynamicCache.from_legacy_cache(user_defined_cache)
+                    if not requires_cross_attention_cache
+                    else EncoderDecoderCache.from_legacy_cache(user_defined_cache)
+                )
+            return
+
+        # Quick escape route 2: if the user specifies no cache is to be used. (conflicting arguments are handled in
+        # `generation_config.validate()`)
+        if generation_config.use_cache is False:
+            return
+
+        # Quick escape route 3: model that only supports legacy caches = nothing to prepare
+        if not self._supports_default_dynamic_cache():
+            if generation_config.cache_implementation is not None:
+                warnings.warn(
+                    "This model does not support `Cache` instances, it only supports the legacy cache format (tuple "
+                    f"of tuples). `cache_implementation` (set to {generation_config.cache_implementation}) will be "
+                    "ignored.",
+                    UserWarning,
+                )
+            return
+
+        # Otherwise we NEED to prepare a cache, based on `generation_config.cache_implementation`
+
+        # TODO(joao): support static caches in assisted generation. assisted generation needs to roll back caches,
+        # which is only supported in dynamic caches atm
+        if assistant_model is not None and generation_config.cache_implementation is not None:
+            logger.warning_once(
+                "An assistant model is provided, using a dynamic cache instead of a cache of type="
+                f"'{generation_config.cache_implementation}'."
+            )
+            generation_config.cache_implementation = None
+
+        if generation_config.cache_implementation is not None:
+            if generation_config.cache_implementation in NEED_SETUP_CACHE_CLASSES_MAPPING:
+                if generation_config.cache_implementation == "static" and not self._supports_static_cache:
+                    raise ValueError(
+                        "This model does not support `cache_implementation='static'`. Please check the following "
+                        "issue: https://github.com/huggingface/transformers/issues/28981"
+                    )
+                model_kwargs[cache_name] = self._get_cache(
+                    cache_implementation=generation_config.cache_implementation,
+                    batch_size=max(generation_config.num_beams, generation_config.num_return_sequences) * batch_size,
+                    max_cache_len=max_cache_length,
+                    device=device,
+                    model_kwargs=model_kwargs,
+                )
+            elif generation_config.cache_implementation == "quantized":
+                if not self._supports_quantized_cache:
+                    raise ValueError(
+                        "This model does not support the quantized cache. If you want your model to support quantized "
+                        "cache, please open an issue and tag @zucchini-nlp."
+                    )
+
+                cache_config = (
+                    generation_config.cache_config
+                    if generation_config.cache_config is not None
+                    else QuantizedCacheConfig()
+                )
+                cache_class = QUANT_BACKEND_CLASSES_MAPPING[cache_config.backend]
+
+                if cache_config.backend == "quanto" and not is_optimum_quanto_available():
+                    raise ImportError(
+                        "You need to install optimum-quanto in order to use KV cache quantization with optimum-quanto backend. "
+                        "Please install it via  with `pip install optimum-quanto`"
+                    )
+                elif cache_config.backend == "HQQ" and not is_hqq_available():
+                    raise ImportError(
+                        "You need to install `HQQ` in order to use KV cache quantization with HQQ backend. "
+                        "Please install it via  with `pip install hqq`"
+                    )
+
+                model_kwargs[cache_name] = cache_class(cache_config)
+            elif generation_config.cache_implementation == "offloaded":
+                model_kwargs[cache_name] = OffloadedCache()
+
+        # Use DynamicCache() instance by default. This will avoid back and forth from legacy format that
+        # keeps copying the cache thus using much more memory
+        else:
+            model_kwargs[cache_name] = (
+                DynamicCache()
+                if not requires_cross_attention_cache
+                else EncoderDecoderCache(DynamicCache(), DynamicCache())
+            )
+
+    def _supports_num_logits_to_keep(self) -> bool:
+        """
+        Return True if the current model supports the keyword argument `num_logits_to_keep` in forward()
+        to save memory. Checking it in this way allows to avoid using a new model attribute.
+        """
+        return "num_logits_to_keep" in set(inspect.signature(self.forward).parameters.keys())
+
+    def _prepare_special_tokens(
+        self,
+        generation_config: GenerationConfig,
+        kwargs_has_attention_mask: Optional[bool] = None,
+        device: Optional[Union[torch.device, str]] = None,
+    ):
+        """
+        Prepares the special tokens for generation, overwriting the generation config with their processed versions
+        converted to tensor.
+
+        Note that `generation_config` is changed in place and stops being serializable after this method is called.
+        That is no problem if called within `generate` (`generation_config` is a local copy that doesn't leave the
+        function). However, if called outside `generate`, consider creating a copy of `generation_config` first.
+        """
+
+        # Convert special tokens to tensors
+        def _tensor_or_none(token, device=None):
+            if token is None:
+                return token
+
+            device = device if device is not None else self.device
+            if isinstance(token, torch.Tensor):
+                return token.to(device)
+            return torch.tensor(token, device=device, dtype=torch.long)
+
+        bos_token_tensor = _tensor_or_none(generation_config.bos_token_id, device=device)
+        eos_token_tensor = _tensor_or_none(generation_config.eos_token_id, device=device)
+        pad_token_tensor = _tensor_or_none(generation_config.pad_token_id, device=device)
+        decoder_start_token_tensor = _tensor_or_none(generation_config.decoder_start_token_id, device=device)
+
+        # for BC we also try to get `decoder_start_token_id` or `bos_token_id` (#30892)
+        if self.config.is_encoder_decoder:
+            decoder_start_token_tensor = (
+                decoder_start_token_tensor if decoder_start_token_tensor is not None else bos_token_tensor
+            )
+
+        # We can have more than one eos token. Always treat it as a 1D tensor (when it exists).
+        if eos_token_tensor is not None and eos_token_tensor.ndim == 0:
+            eos_token_tensor = eos_token_tensor.unsqueeze(0)
+
+        # Set pad token if unset (and there are conditions to do so)
+        if pad_token_tensor is None and eos_token_tensor is not None:
+            if not is_torchdynamo_compiling():
+                if kwargs_has_attention_mask is not None and not kwargs_has_attention_mask:
+                    logger.warning(
+                        "The attention mask and the pad token id were not set. As a consequence, you may observe "
+                        "unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results."
+                    )
+            pad_token_tensor = eos_token_tensor[0]
+            logger.warning(f"Setting `pad_token_id` to `eos_token_id`:{pad_token_tensor} for open-end generation.")
+
+        # Sanity checks/warnings
+        if self.config.is_encoder_decoder and decoder_start_token_tensor is None:
+            raise ValueError(
+                "`decoder_start_token_id` or `bos_token_id` has to be defined for encoder-decoder generation."
+            )
+        if not is_torchdynamo_compiling():  # Checks that depend on tensor-dependent control flow
+            if (
+                eos_token_tensor is not None
+                and isin_mps_friendly(elements=eos_token_tensor, test_elements=pad_token_tensor).any()
+            ):
+                if kwargs_has_attention_mask is not None and not kwargs_has_attention_mask:
+                    logger.warning_once(
+                        "The attention mask is not set and cannot be inferred from input because pad token is same as "
+                        "eos token. As a consequence, you may observe unexpected behavior. Please pass your input's "
+                        "`attention_mask` to obtain reliable results."
+                    )
+            if eos_token_tensor is not None and (
+                torch.is_floating_point(eos_token_tensor) or (eos_token_tensor < 0).any()
+            ):
+                logger.warning(
+                    f"`eos_token_id` should consist of positive integers, but is {eos_token_tensor}. Your generation "
+                    "will not stop until the maximum length is reached. Depending on other flags, it may even crash."
+                )
+
+        # Update generation config with the updated special tokens tensors
+        # NOTE: this must be written into a different attribute name than the one holding the original special tokens
+        # (in their non-tensor form), in order to enable end-to-end compilation. See
+        # https://pytorch.org/docs/stable/torch.compiler_cudagraph_trees.html#limitations
+        generation_config._bos_token_tensor = bos_token_tensor
+        generation_config._eos_token_tensor = eos_token_tensor
+        generation_config._pad_token_tensor = pad_token_tensor
+        generation_config._decoder_start_token_tensor = decoder_start_token_tensor
+
+    @torch.no_grad()
+    def generate(
+        self,
+        inputs: Optional[torch.Tensor] = None,
+        generation_config: Optional[GenerationConfig] = None,
+        logits_processor: Optional[LogitsProcessorList] = None,
+        stopping_criteria: Optional[StoppingCriteriaList] = None,
+        prefix_allowed_tokens_fn: Optional[Callable[[int, torch.Tensor], List[int]]] = None,
+        synced_gpus: Optional[bool] = None,
+        assistant_model: Optional["PreTrainedModel"] = None,
+        streamer: Optional["BaseStreamer"] = None,
+        negative_prompt_ids: Optional[torch.Tensor] = None,
+        negative_prompt_attention_mask: Optional[torch.Tensor] = None,
+        **kwargs,
+    ) -> Union[GenerateOutput, torch.LongTensor]:
+        r"""
+
+        Generates sequences of token ids for models with a language modeling head.
+
+        <Tip warning={true}>
+
+        Most generation-controlling parameters are set in `generation_config` which, if not passed, will be set to the
+        model's default generation configuration. You can override any `generation_config` by passing the corresponding
+        parameters to generate(), e.g. `.generate(inputs, num_beams=4, do_sample=True)`.
+
+        For an overview of generation strategies and code examples, check out the [following
+        guide](../generation_strategies).
+
+        </Tip>
+
+        Parameters:
+            inputs (`torch.Tensor` of varying shape depending on the modality, *optional*):
+                The sequence used as a prompt for the generation or as model inputs to the encoder. If `None` the
+                method initializes it with `bos_token_id` and a batch size of 1. For decoder-only models `inputs`
+                should be in the format of `input_ids`. For encoder-decoder models *inputs* can represent any of
+                `input_ids`, `input_values`, `input_features`, or `pixel_values`.
+            generation_config ([`~generation.GenerationConfig`], *optional*):
+                The generation configuration to be used as base parametrization for the generation call. `**kwargs`
+                passed to generate matching the attributes of `generation_config` will override them. If
+                `generation_config` is not provided, the default will be used, which has the following loading
+                priority: 1) from the `generation_config.json` model file, if it exists; 2) from the model
+                configuration. Please note that unspecified parameters will inherit [`~generation.GenerationConfig`]'s
+                default values, whose documentation should be checked to parameterize generation.
+            logits_processor (`LogitsProcessorList`, *optional*):
+                Custom logits processors that complement the default logits processors built from arguments and
+                generation config. If a logit processor is passed that is already created with the arguments or a
+                generation config an error is thrown. This feature is intended for advanced users.
+            stopping_criteria (`StoppingCriteriaList`, *optional*):
+                Custom stopping criteria that complements the default stopping criteria built from arguments and a
+                generation config. If a stopping criteria is passed that is already created with the arguments or a
+                generation config an error is thrown. If your stopping criteria depends on the `scores` input, make
+                sure you pass `return_dict_in_generate=True, output_scores=True` to `generate`. This feature is
+                intended for advanced users.
+            prefix_allowed_tokens_fn (`Callable[[int, torch.Tensor], List[int]]`, *optional*):
+                If provided, this function constraints the beam search to allowed tokens only at each step. If not
+                provided no constraint is applied. This function takes 2 arguments: the batch ID `batch_id` and
+                `input_ids`. It has to return a list with the allowed tokens for the next generation step conditioned
+                on the batch ID `batch_id` and the previously generated tokens `inputs_ids`. This argument is useful
+                for constrained generation conditioned on the prefix, as described in [Autoregressive Entity
+                Retrieval](https://arxiv.org/abs/2010.00904).
+            synced_gpus (`bool`, *optional*):
+                Whether to continue running the while loop until max_length. Unless overridden, this flag will be set
+                to `True` if using `FullyShardedDataParallel` or DeepSpeed ZeRO Stage 3 with multiple GPUs to avoid
+                deadlocking if one GPU finishes generating before other GPUs. Otherwise, defaults to `False`.
+            assistant_model (`PreTrainedModel`, *optional*):
+                An assistant model that can be used to accelerate generation. The assistant model must have the exact
+                same tokenizer. The acceleration is achieved when forecasting candidate tokens with the assistant model
+                is much faster than running generation with the model you're calling generate from. As such, the
+                assistant model should be much smaller.
+            streamer (`BaseStreamer`, *optional*):
+                Streamer object that will be used to stream the generated sequences. Generated tokens are passed
+                through `streamer.put(token_ids)` and the streamer is responsible for any further processing.
+            negative_prompt_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+                The negative prompt needed for some processors such as CFG. The batch size must match the input batch
+                size. This is an experimental feature, subject to breaking API changes in future versions.
+            negative_prompt_attention_mask (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+                Attention_mask for `negative_prompt_ids`.
+            kwargs (`Dict[str, Any]`, *optional*):
+                Ad hoc parametrization of `generation_config` and/or additional model-specific kwargs that will be
+                forwarded to the `forward` function of the model. If the model is an encoder-decoder model, encoder
+                specific kwargs should not be prefixed and decoder specific kwargs should be prefixed with *decoder_*.
+
+        Return:
+            [`~utils.ModelOutput`] or `torch.LongTensor`: A [`~utils.ModelOutput`] (if `return_dict_in_generate=True`
+            or when `config.return_dict_in_generate=True`) or a `torch.LongTensor`.
+
+                If the model is *not* an encoder-decoder model (`model.config.is_encoder_decoder=False`), the possible
+                [`~utils.ModelOutput`] types are:
+
+                    - [`~generation.GenerateDecoderOnlyOutput`],
+                    - [`~generation.GenerateBeamDecoderOnlyOutput`]
+
+                If the model is an encoder-decoder model (`model.config.is_encoder_decoder=True`), the possible
+                [`~utils.ModelOutput`] types are:
+
+                    - [`~generation.GenerateEncoderDecoderOutput`],
+                    - [`~generation.GenerateBeamEncoderDecoderOutput`]
+        """
+
+        # 1. Handle `generation_config` and kwargs that might update it, and validate the `.generate()` call
+        self._validate_model_class()
+        tokenizer = kwargs.pop("tokenizer", None)  # Pull this out first, we only use it for stopping criteria
+        assistant_tokenizer = kwargs.pop("assistant_tokenizer", None)  # only used for assisted generation
+
+        generation_config, model_kwargs = self._prepare_generation_config(generation_config, **kwargs)
+        self._validate_model_kwargs(model_kwargs.copy())
+        self._validate_assistant(assistant_model, tokenizer, assistant_tokenizer)
+
+        # 2. Set generation parameters if not already defined
+        if synced_gpus is None:
+            synced_gpus = (is_deepspeed_zero3_enabled() or is_fsdp_managed_module(self)) and dist.get_world_size() > 1
+
+        logits_processor = logits_processor if logits_processor is not None else LogitsProcessorList()
+        stopping_criteria = stopping_criteria if stopping_criteria is not None else StoppingCriteriaList()
+
+        accepts_attention_mask = "attention_mask" in set(inspect.signature(self.forward).parameters.keys())
+        requires_attention_mask = "encoder_outputs" not in model_kwargs
+        kwargs_has_attention_mask = model_kwargs.get("attention_mask", None) is not None
+
+        # 3. Define model inputs
+        inputs_tensor, model_input_name, model_kwargs = self._prepare_model_inputs(
+            inputs, generation_config.bos_token_id, model_kwargs
+        )
+        batch_size = inputs_tensor.shape[0]
+
+        device = inputs_tensor.device
+        self._prepare_special_tokens(generation_config, kwargs_has_attention_mask, device=device)
+
+        # decoder-only models must use left-padding for batched generation.
+        if not self.config.is_encoder_decoder and not is_torchdynamo_compiling():
+            # If `input_ids` was given, check if the last id in any sequence is `pad_token_id`
+            # Note: If using, `inputs_embeds` this check does not work, because we want to be more hands-off.
+            if (
+                generation_config._pad_token_tensor is not None
+                and batch_size > 1
+                and len(inputs_tensor.shape) == 2
+                and torch.sum(inputs_tensor[:, -1] == generation_config._pad_token_tensor) > 0
+            ):
+                logger.warning(
+                    "A decoder-only architecture is being used, but right-padding was detected! For correct "
+                    "generation results, please set `padding_side='left'` when initializing the tokenizer."
+                )
+
+        # 4. Define other model kwargs
+        # decoder-only models with inputs_embeds forwarding must use caching (otherwise we can't detect whether we are
+        # generating the first new token or not, and we only want to use the embeddings for the first new token)
+        if not self.config.is_encoder_decoder and model_input_name == "inputs_embeds":
+            generation_config.use_cache = True
+
+        if not kwargs_has_attention_mask and requires_attention_mask and accepts_attention_mask:
+            model_kwargs["attention_mask"] = self._prepare_attention_mask_for_generation(
+                inputs_tensor, generation_config, model_kwargs
+            )
+        elif kwargs_has_attention_mask:
+            # TODO (joao): generalize this check with other types of inputs
+            if model_input_name == "input_ids" and len(model_kwargs["attention_mask"].shape) > 2:
+                raise ValueError("`attention_mask` passed to `generate` must be 2D.")
+
+        if self.config.is_encoder_decoder and "encoder_outputs" not in model_kwargs:
+            # if model is encoder decoder encoder_outputs are created and added to `model_kwargs`
+            model_kwargs = self._prepare_encoder_decoder_kwargs_for_generation(
+                inputs_tensor, model_kwargs, model_input_name, generation_config
+            )
+
+        # 5. Prepare `input_ids` which will be used for auto-regressive generation
+        if self.config.is_encoder_decoder:
+            input_ids, model_kwargs = self._prepare_decoder_input_ids_for_generation(
+                batch_size=batch_size,
+                model_input_name=model_input_name,
+                model_kwargs=model_kwargs,
+                decoder_start_token_id=generation_config._decoder_start_token_tensor,
+                device=inputs_tensor.device,
+            )
+        else:
+            input_ids = inputs_tensor if model_input_name == "input_ids" else model_kwargs.pop("input_ids")
+
+        if generation_config.token_healing:
+            input_ids = self.heal_tokens(input_ids, tokenizer)
+
+        if streamer is not None:
+            streamer.put(input_ids.cpu())
+
+        # 6. Prepare `max_length` depending on other stopping criteria.
+        input_ids_length = input_ids.shape[-1]
+        has_default_max_length = kwargs.get("max_length") is None and generation_config.max_length is not None
+        has_default_min_length = kwargs.get("min_length") is None and generation_config.min_length is not None
+        generation_config = self._prepare_generated_length(
+            generation_config=generation_config,
+            has_default_max_length=has_default_max_length,
+            has_default_min_length=has_default_min_length,
+            model_input_name=model_input_name,
+            inputs_tensor=inputs_tensor,
+            input_ids_length=input_ids_length,
+        )
+
+        # If the model supports `num_logits_to_keep` in forward(), set it to 1 to avoid computing the whole
+        # logit matrix. This can save a lot of memory during the first forward pass. Note that assisted decoding
+        # dynamically overrides this value as it can need more than the last token logits
+        if self._supports_num_logits_to_keep() and "num_logits_to_keep" not in model_kwargs:
+            model_kwargs["num_logits_to_keep"] = 1
+
+        self._validate_generated_length(generation_config, input_ids_length, has_default_max_length)
+
+        # 7. Prepare the cache.
+        # - `model_kwargs` may be updated in place with a cache as defined by the parameters in `generation_config`.
+        # - different models have a different cache name expected by the model (default = "past_key_values")
+        # - `max_length`, prepared above, is used to determine the maximum cache length
+        # TODO (joao): remove `user_defined_cache` after v4.47 (remove default conversion to legacy format)
+        cache_name = "past_key_values" if "mamba" not in self.__class__.__name__.lower() else "cache_params"
+        user_defined_cache = model_kwargs.get(cache_name)
+        max_cache_length = generation_config.max_length
+        if (
+            inputs_tensor.shape[1] != input_ids_length
+            and model_input_name == "inputs_embeds"
+            and not self.config.is_encoder_decoder
+        ):
+            max_cache_length += inputs_tensor.shape[1]
+        self._prepare_cache_for_generation(
+            generation_config, model_kwargs, assistant_model, batch_size, max_cache_length, device
+        )
+
+        # 8. determine generation mode
+        generation_mode = generation_config.get_generation_mode(assistant_model)
+
+        if streamer is not None and (generation_config.num_beams > 1):
+            raise ValueError(
+                "`streamer` cannot be used with beam search (yet!). Make sure that `num_beams` is set to 1."
+            )
+
+        if not is_torchdynamo_compiling() and self.device.type != input_ids.device.type:
+            warnings.warn(
+                "You are calling .generate() with the `input_ids` being on a device type different"
+                f" than your model's device. `input_ids` is on {input_ids.device.type}, whereas the model"
+                f" is on {self.device.type}. You may experience unexpected behaviors or slower generation."
+                " Please make sure that you have put `input_ids` to the"
+                f" correct device by calling for example input_ids = input_ids.to('{self.device.type}') before"
+                " running `.generate()`.",
+                UserWarning,
+            )
+
+        # 9. prepare logits processors and stopping criteria
+        prepared_logits_processor = self._get_logits_processor(
+            generation_config=generation_config,
+            input_ids_seq_length=input_ids_length,
+            encoder_input_ids=inputs_tensor,
+            prefix_allowed_tokens_fn=prefix_allowed_tokens_fn,
+            logits_processor=logits_processor,
+            device=inputs_tensor.device,
+            model_kwargs=model_kwargs,
+            negative_prompt_ids=negative_prompt_ids,
+            negative_prompt_attention_mask=negative_prompt_attention_mask,
+        )
+        prepared_stopping_criteria = self._get_stopping_criteria(
+            generation_config=generation_config, stopping_criteria=stopping_criteria, tokenizer=tokenizer, **kwargs
+        )
+
+        # Set model_kwargs `use_cache` so we can use it later in forward runs
+        model_kwargs["use_cache"] = generation_config.use_cache
+
+        # 10. go into different generation modes
+        if generation_mode == GenerationMode.ASSISTED_GENERATION:
+            if generation_config.num_return_sequences > 1:
+                raise ValueError(
+                    "num_return_sequences has to be 1 when doing assisted generate, "
+                    f"but is {generation_config.num_return_sequences}."
+                )
+            if batch_size > 1:
+                raise ValueError("assisted generate is only supported for batch_size = 1")
+            if not model_kwargs["use_cache"]:
+                raise ValueError("assisted generate requires `use_cache=True`")
+            if generation_config.cache_implementation in ["static", "hybrid", "sliding_window"]:
+                raise ValueError("assisted generate is not supported with Static cache classes`")
+            if self._is_stateful:
+                # In assisted generation we need the ability to confirm whether the model would pick certain tokens,
+                # which is not possible with stateful models (they can't reset to a previous subset of generated text)
+                raise ValueError(
+                    f"assisted generation is not supported with stateful models, such as {self.__class__.__name__}"
+                )
+
+            # 11. Get the candidate generator, given the parameterization
+            candidate_generator = self._get_candidate_generator(
+                generation_config=generation_config,
+                input_ids=input_ids,
+                inputs_tensor=inputs_tensor,
+                assistant_model=assistant_model,
+                logits_processor=logits_processor,
+                target_tokenizer=tokenizer,
+                assistant_tokenizer=assistant_tokenizer,
+                model_kwargs=model_kwargs,
+            )
+
+            # 12. run assisted generate
+            result = self._assisted_decoding(
+                input_ids,
+                candidate_generator=candidate_generator,
+                logits_processor=prepared_logits_processor,
+                stopping_criteria=prepared_stopping_criteria,
+                generation_config=generation_config,
+                synced_gpus=synced_gpus,
+                streamer=streamer,
+                **model_kwargs,
+            )
+        elif generation_mode == GenerationMode.DOLA_GENERATION:
+            if self._is_stateful:
+                # DoLa decoding was not designed for stateful models, and would require some changes
+                raise ValueError(
+                    f"dola decoding is not supported with stateful models, such as {self.__class__.__name__}"
+                )
+            result = self._dola_decoding(
+                input_ids,
+                dola_layers=generation_config.dola_layers,
+                logits_processor=prepared_logits_processor,
+                stopping_criteria=prepared_stopping_criteria,
+                generation_config=generation_config,
+                synced_gpus=synced_gpus,
+                streamer=streamer,
+                **model_kwargs,
+            )
+
+        elif generation_mode == GenerationMode.CONTRASTIVE_SEARCH:
+            if not model_kwargs["use_cache"]:
+                raise ValueError("Contrastive search requires `use_cache=True`")
+            if self._is_stateful:
+                # Just like assisted generation, we need to be able to rollback to a previous state (see comment above)
+                raise ValueError(
+                    f"contrastive search is not supported with stateful models, such as {self.__class__.__name__}"
+                )
+
+            result = self._contrastive_search(
+                input_ids,
+                logits_processor=prepared_logits_processor,
+                stopping_criteria=prepared_stopping_criteria,
+                generation_config=generation_config,
+                synced_gpus=synced_gpus,
+                streamer=streamer,
+                **model_kwargs,
+            )
+
+        elif generation_mode in (GenerationMode.SAMPLE, GenerationMode.GREEDY_SEARCH):
+            # 11. expand input_ids with `num_return_sequences` additional sequences per batch
+            input_ids, model_kwargs = self._expand_inputs_for_generation(
+                input_ids=input_ids,
+                expand_size=generation_config.num_return_sequences,
+                is_encoder_decoder=self.config.is_encoder_decoder,
+                **model_kwargs,
+            )
+
+            # 12. run sample (it degenerates to greedy search when `generation_config.do_sample=False`)
+            result = self._sample(
+                input_ids,
+                logits_processor=prepared_logits_processor,
+                stopping_criteria=prepared_stopping_criteria,
+                generation_config=generation_config,
+                synced_gpus=synced_gpus,
+                streamer=streamer,
+                **model_kwargs,
+            )
+
+        elif generation_mode in (GenerationMode.BEAM_SAMPLE, GenerationMode.BEAM_SEARCH):
+            # 11. prepare beam search scorer
+            beam_scorer = BeamSearchScorer(
+                batch_size=batch_size,
+                num_beams=generation_config.num_beams,
+                device=inputs_tensor.device,
+                length_penalty=generation_config.length_penalty,
+                do_early_stopping=generation_config.early_stopping,
+                num_beam_hyps_to_keep=generation_config.num_return_sequences,
+                max_length=generation_config.max_length,
+            )
+
+            # 12. interleave input_ids with `num_beams` additional sequences per batch
+            input_ids, model_kwargs = self._expand_inputs_for_generation(
+                input_ids=input_ids,
+                expand_size=generation_config.num_beams,
+                is_encoder_decoder=self.config.is_encoder_decoder,
+                **model_kwargs,
+            )
+
+            # 13. run beam sample
+            result = self._beam_search(
+                input_ids,
+                beam_scorer,
+                logits_processor=prepared_logits_processor,
+                stopping_criteria=prepared_stopping_criteria,
+                generation_config=generation_config,
+                synced_gpus=synced_gpus,
+                **model_kwargs,
+            )
+
+        elif generation_mode == GenerationMode.GROUP_BEAM_SEARCH:
+            # 11. prepare beam search scorer
+            beam_scorer = BeamSearchScorer(
+                batch_size=batch_size,
+                num_beams=generation_config.num_beams,
+                device=inputs_tensor.device,
+                length_penalty=generation_config.length_penalty,
+                do_early_stopping=generation_config.early_stopping,
+                num_beam_hyps_to_keep=generation_config.num_return_sequences,
+                num_beam_groups=generation_config.num_beam_groups,
+                max_length=generation_config.max_length,
+            )
+            # 12. interleave input_ids with `num_beams` additional sequences per batch
+            input_ids, model_kwargs = self._expand_inputs_for_generation(
+                input_ids=input_ids,
+                expand_size=generation_config.num_beams,
+                is_encoder_decoder=self.config.is_encoder_decoder,
+                **model_kwargs,
+            )
+            # 13. run beam search
+            result = self._group_beam_search(
+                input_ids,
+                beam_scorer,
+                logits_processor=prepared_logits_processor,
+                stopping_criteria=prepared_stopping_criteria,
+                generation_config=generation_config,
+                synced_gpus=synced_gpus,
+                **model_kwargs,
+            )
+
+        elif generation_mode == GenerationMode.CONSTRAINED_BEAM_SEARCH:
+            final_constraints = []
+            if generation_config.constraints is not None:
+                final_constraints = generation_config.constraints
+
+            if generation_config.force_words_ids is not None:
+
+                def typeerror():
+                    raise ValueError(
+                        "`force_words_ids` has to either be a `List[List[List[int]]]` or `List[List[int]]` "
+                        f"of positive integers, but is {generation_config.force_words_ids}."
+                    )
+
+                if (
+                    not isinstance(generation_config.force_words_ids, list)
+                    or len(generation_config.force_words_ids) == 0
+                ):
+                    typeerror()
+
+                for word_ids in generation_config.force_words_ids:
+                    if isinstance(word_ids[0], list):
+                        if not isinstance(word_ids, list) or len(word_ids) == 0:
+                            typeerror()
+                        if any(not isinstance(token_ids, list) for token_ids in word_ids):
+                            typeerror()
+                        if any(
+                            any((not isinstance(token_id, int) or token_id < 0) for token_id in token_ids)
+                            for token_ids in word_ids
+                        ):
+                            typeerror()
+
+                        constraint = DisjunctiveConstraint(word_ids)
+                    else:
+                        if not isinstance(word_ids, list) or len(word_ids) == 0:
+                            typeerror()
+                        if any((not isinstance(token_id, int) or token_id < 0) for token_id in word_ids):
+                            typeerror()
+
+                        constraint = PhrasalConstraint(word_ids)
+                    final_constraints.append(constraint)
+
+            # 11. prepare beam search scorer
+            constrained_beam_scorer = ConstrainedBeamSearchScorer(
+                constraints=final_constraints,
+                batch_size=batch_size,
+                num_beams=generation_config.num_beams,
+                device=inputs_tensor.device,
+                length_penalty=generation_config.length_penalty,
+                do_early_stopping=generation_config.early_stopping,
+                num_beam_hyps_to_keep=generation_config.num_return_sequences,
+                max_length=generation_config.max_length,
+            )
+            # 12. interleave input_ids with `num_beams` additional sequences per batch
+            input_ids, model_kwargs = self._expand_inputs_for_generation(
+                input_ids=input_ids,
+                expand_size=generation_config.num_beams,
+                is_encoder_decoder=self.config.is_encoder_decoder,
+                **model_kwargs,
+            )
+            # 13. run beam search
+            result = self._constrained_beam_search(
+                input_ids,
+                constrained_beam_scorer=constrained_beam_scorer,
+                logits_processor=prepared_logits_processor,
+                stopping_criteria=prepared_stopping_criteria,
+                generation_config=generation_config,
+                synced_gpus=synced_gpus,
+                **model_kwargs,
+            )
+
+        # Convert to legacy cache format if requested
+        if (
+            generation_config.return_legacy_cache is not False  # Should check for `True` after v4.47
+            and not is_torchdynamo_compiling()
+            and hasattr(result, "past_key_values")
+            and hasattr(result.past_key_values, "to_legacy_cache")
+            and result.past_key_values.to_legacy_cache is not None
+        ):
+            # handle BC (convert by default if he user hasn't passed a cache AND the cache is of the default type)
+            should_convert_cache = generation_config.return_legacy_cache
+            is_user_defined_cache = user_defined_cache is not None
+            is_default_cache_type = (
+                type(result.past_key_values) == DynamicCache  # noqa E721
+                or (
+                    isinstance(result.past_key_values, EncoderDecoderCache)
+                    and type(result.past_key_values.self_attention_cache) == DynamicCache  # noqa E721
+                    and type(result.past_key_values.cross_attention_cache) == DynamicCache  # noqa E721
+                )
+            )
+            if not is_user_defined_cache and is_default_cache_type:
+                logger.warning_once(
+                    "From v4.47 onwards, when a model cache is to be returned, `generate` will return a `Cache` "
+                    "instance instead by default (as opposed to the legacy tuple of tuples format). If you want to "
+                    "keep returning the legacy format, please set `return_legacy_cache=True`."
+                )
+                should_convert_cache = True
+            if should_convert_cache:
+                result.past_key_values = result.past_key_values.to_legacy_cache()
+        return result
+
+    def _has_unfinished_sequences(
+        self,
+        this_peer_finished: bool,
+        synced_gpus: bool,
+        device: torch.device,
+        cur_len: Optional[int] = None,
+        max_length: Optional[int] = None,
+    ) -> bool:
+        """
+        Returns whether there are still unfinished sequences in the device. The existence of unfinished sequences is
+        fed through `this_peer_finished`. ZeRO stage 3-friendly.
+        """
+        # torch.compile does not support data-dependent control flow. This is a workaround to allow torch.compile,
+        # although we lose the ability to stop when all sequences return an EOS token (and other stopping criteria)
+        # TODO (joao): remove this when torch's support for control flow is not experimental (https://pytorch.org/docs/stable/generated/torch.cond.html)
+        if is_torchdynamo_compiling():
+            return cur_len < max_length
+        else:
+            if synced_gpus:
+                # Under synced_gpus the `forward` call must continue until all gpus complete their sequence.
+                # The following logic allows an early break if all peers finished generating their sequence
+                this_peer_finished_flag = torch.tensor(0.0 if this_peer_finished else 1.0).to(device)
+                # send 0.0 if we finished, 1.0 otherwise
+                dist.all_reduce(this_peer_finished_flag, op=dist.ReduceOp.SUM)
+                # did all peers finish? the reduced sum will be 0.0 then
+                if this_peer_finished_flag.item() == 0.0:
+                    return False
+            elif this_peer_finished:
+                return False
+            return True
+
+    def heal_tokens(
+        self, input_ids: torch.LongTensor, tokenizer: Optional["PreTrainedTokenizerBase"] = None
+    ) -> torch.LongTensor:
+        r"""
+        Generates sequences of token ids for models with a language modeling head.
+        Parameters:
+            input_ids (`torch.LongTensor`): The sequence used as a prompt for the generation.
+            tokenizer (`PreTrainedTokenizerBase`, *optional*): The tokenizer used to decode the input ids.
+        Return:
+            `torch.LongTensor` where each sequence has its tail token replaced with its appropriate extension.
+        """
+        if tokenizer is None:
+            raise ValueError(
+                " When generating with token healing, you must pass the model's tokenizer to the `tokenizer` "
+                "argument of `generate`."
+            )
+
+        bos_token_id, pad_token_id = tokenizer.bos_token_id, tokenizer.pad_token_id
+        vocab_trie = ExtensionsTrie(tokenizer.get_vocab())
+        generation_config = GenerationConfig(max_new_tokens=1, pad_token_id=pad_token_id)
+
+        # assumption: leading/trailing whitespace is not meaningful, so the prompts are
+        # stripped before re-tokenizing to desensitize generation to whitespace artefacts
+        prompts = [p.strip() for p in tokenizer.batch_decode(input_ids, skip_special_tokens=True)]
+        input_ids = tokenizer(
+            prompts,
+            return_tensors="pt",
+            padding=True,
+        ).input_ids.to(input_ids.device)
+
+        # replace bos with pad to not condition healing on it
+        input_ids = torch.where(input_ids == bos_token_id, pad_token_id, input_ids)
+
+        """
+        the latter code assumes the input_ids is not empty,
+        input_id has to be checked if contains elements
+		"""
+        if input_ids.numel() == 0:
+            return input_ids
+
+        tail_ids = input_ids[:, -1].tolist()
+
+        space_tok = tokenizer.convert_ids_to_tokens(tokenizer.convert_tokens_to_ids(" "))[0]
+        # tail tokens are used for a prefix search, thus, whitespaces are replaced with
+        # their tokenization (e.g. 'Ġ') to enable search for tokens prefixed with a whitespace
+        tail_toks = (tokenizer.decode(t).replace(" ", space_tok) for t in tail_ids)
+
+        for batch_idx, (tail_id, tail_tok) in enumerate(zip(tail_ids, tail_toks)):
+            batch_ids = input_ids[batch_idx]
+            if torch.all(batch_ids == pad_token_id).item():
+                continue  # skip empty sequences (all pad ids)
+
+            # apply bias for alternatives (extensions) to the tail token
+            """
+            seq_bias key has to be tuple with int so have to use
+            tokenizer function to convert str to int
+			"""
+            seq_bias = {
+                (tokenizer.convert_tokens_to_ids(alt_tok),): 10.0 for alt_tok in vocab_trie.extensions(prefix=tail_tok)
+            }
+
+            if len(seq_bias) == 1:
+                continue  # skip if there are no token alternatives to heal with
+
+            # slightly favor original token to limit aggressive healing e.g. 'http' -> 'https'
+            seq_bias[(tail_id,)] += 1.0
+            generation_config.update(sequence_bias=seq_bias)
+
+            trimmed_ids = batch_ids[:-1]
+
+            """
+            the latter code assumes trimmed_ids is not empty
+            so have to check the its element count
+			"""
+            if trimmed_ids.numel() == 0:
+                continue
+
+            # if the prompt is a single (non-pad) token, regenerate from bos
+            if len(batch_ids[batch_ids != pad_token_id]) == 1:
+                trimmed_ids[-1] = bos_token_id
+
+            input_ids[batch_idx] = self.generate(trimmed_ids.unsqueeze(0), generation_config=generation_config)
+
+        return input_ids
+
+    def _dola_decoding(
+        self,
+        input_ids: torch.LongTensor,
+        dola_layers: Union[str, List[int]],
+        logits_processor: LogitsProcessorList,
+        stopping_criteria: StoppingCriteriaList,
+        generation_config: GenerationConfig,
+        synced_gpus: bool,
+        streamer: "BaseStreamer",
+        **model_kwargs,
+    ) -> Union[GenerateNonBeamOutput, torch.LongTensor]:
+        r"""
+        Generates sequences of token ids for models with a language modeling head using **dola decoding** and can be
+        used for decoder-only text models.
+        The method is based on the paper "DoLa: Decoding by Contrasting Layers Improves Factuality in Large Language
+        Models" (https://arxiv.org/abs/2309.03883) in ICLR 2024.
+
+        Parameters:
+            input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
+                The sequence used as a prompt for the generation.
+            dola_layers (`Union[str, List[int]]`):
+                The candidate layers used in contrasting layers of DoLa. It can be either 1) 'low' or 'high', which
+                means the lower part or higher part of the model layers, respectively, or 2) a list of layer indices
+                to be used for candidate layers. The 0-th layer is the word embedding layer of the model.
+            logits_processor (`LogitsProcessorList`):
+                An instance of [`LogitsProcessorList`]. List of instances of class derived from [`LogitsProcessor`]
+                used to modify the prediction scores of the language modeling head applied at each generation step.
+            stopping_criteria (`StoppingCriteriaList`, *optional*):
+                An instance of [`StoppingCriteriaList`]. List of instances of class derived from [`StoppingCriteria`]
+                used to tell if the generation loop should stop.
+            generation_config ([`~generation.GenerationConfig`]):
+                The generation configuration to be used as parametrization of the decoding method.
+            synced_gpus (`bool`):
+                Whether to continue running the while loop until max_length (needed to avoid deadlocking with
+                `FullyShardedDataParallel` and DeepSpeed ZeRO Stage 3).
+            streamer (`BaseStreamer`, *optional*):
+                Streamer object that will be used to stream the generated sequences. Generated tokens are passed
+                through `streamer.put(token_ids)` and the streamer is responsible for any further processing.
+            model_kwargs:
+                Additional model specific keyword arguments will be forwarded to the `forward` function of the model.
+                If model is an encoder-decoder model the kwargs should include `encoder_outputs`.
+
+        Return:
+            [`~generation.GenerateDecoderOnlyOutput`], [`~generation.GenerateEncoderDecoderOutput`]
+            or `torch.LongTensor`: A `torch.LongTensor` containing the generated tokens (default behaviour) or a
+            [`~generation.GenerateDecoderOnlyOutput`] if `model.config.is_encoder_decoder=False` and
+            `return_dict_in_generate=True` or a [`~generation.GenerateEncoderDecoderOutput`] if
+            `model.config.is_encoder_decoder=True`.
+        """
+
+        if self.config.is_encoder_decoder:
+            raise ValueError("DoLa decoding is only available for decoder-only models.")
+        # init values
+
+        pad_token_id = generation_config._pad_token_tensor
+        output_attentions = generation_config.output_attentions
+        output_hidden_states = generation_config.output_hidden_states
+        output_scores = generation_config.output_scores
+        output_logits = generation_config.output_logits
+        return_dict_in_generate = generation_config.return_dict_in_generate
+        has_eos_stopping_criteria = any(hasattr(criteria, "eos_token_id") for criteria in stopping_criteria)
+        do_sample = generation_config.do_sample
+
+        # init attention / hidden states / scores tuples
+        scores = () if (return_dict_in_generate and output_scores) else None
+        raw_logits = () if (return_dict_in_generate and output_logits) else None
+        decoder_attentions = () if (return_dict_in_generate and output_attentions) else None
+        cross_attentions = () if (return_dict_in_generate and output_attentions) else None
+        decoder_hidden_states = () if (return_dict_in_generate and output_hidden_states) else None
+
+        # keep track of which sequences are already finished
+        batch_size = input_ids.shape[0]
+        unfinished_sequences = torch.ones(batch_size, dtype=torch.long, device=input_ids.device)
+        model_kwargs = self._get_initial_cache_position(input_ids, model_kwargs)
+
+        this_peer_finished = False
+
+        # prepare layers for DoLa decoding
+        final_layer = self.config.get_text_config().num_hidden_layers
+        # if the model has tied word embeddings, we skip the word embeddings (0-th) layer and start from the 2nd layer,
+        # as the early exit from word embeddings will become identity function
+        # if the model is really shallow (<=2 layers), we use the 1st layer if it's not the final layer and the 0-th
+        # layer otherwise. Notice that DoLa does not help shallow models much.
+        if not self.config.tie_word_embeddings:
+            start_layer = 0
+        elif final_layer > 2:
+            start_layer = 2
+        elif final_layer == 2:
+            start_layer = 1
+        else:
+            start_layer = 0
+
+        # For `N`-layer models with `N <= 40` layers, the layers of `range(0, N // 2, 2)` and `range(N // 2, N, 2)`
+        # are used for `'low'` and `'high'` layers, respectively.
+        # For models with `N > 40` layers, the layers of `range(0, 20, 2)` and `range(N - 20, N, 2)` are used for
+        # `'low'` and `'high'` layers, respectively.
+        if isinstance(dola_layers, str) and dola_layers == "low":
+            if start_layer == final_layer // 2:
+                candidate_premature_layers = [start_layer]
+            else:
+                candidate_premature_layers = (
+                    list(range(start_layer, final_layer // 2, 2))
+                    if final_layer <= 40
+                    else list(range(start_layer, 20, 2))
+                )
+        elif isinstance(dola_layers, str) and dola_layers == "high":
+            candidate_premature_layers = (
+                list(range(final_layer // 2, final_layer, 2))
+                if final_layer <= 40
+                else list(range(final_layer - 20, final_layer, 2))
+            )
+        # Set the `dola_layers` to a list of integers for layer indices to contrast manually specified layers.
+        elif isinstance(dola_layers, list):
+            candidate_premature_layers = [i for i in dola_layers if i < final_layer]
+        else:
+            raise ValueError("dola_layers must be either 'low', 'high' or a list of integers.")
+
+        lm_head = self.get_output_embeddings()
+        if lm_head is None:
+            raise ValueError("DoLa is not supported for models that don't have output embeddings.")
+
+        while self._has_unfinished_sequences(this_peer_finished, synced_gpus, device=input_ids.device):
+            # prepare model inputs
+            model_inputs = self.prepare_inputs_for_generation(input_ids, **model_kwargs)
+
+            # forward pass to get next token
+            outputs = self(
+                **model_inputs,
+                return_dict=True,
+                output_attentions=output_attentions,
+                output_hidden_states=True,
+            )
+
+            # .float() is needed to retain precision for later logits manipulations
+            final_layer_next_token_logits = outputs.logits[:, -1, :].detach().clone().float()
+            final_logits = outputs.logits[:, -1, :].float()
+            candidate_premature_logits = {}
+            for candidate_premature_layer in candidate_premature_layers:
+                candidate_premature_logits[candidate_premature_layer] = lm_head(
+                    outputs.hidden_states[candidate_premature_layer][:, -1, :]
+                ).to(final_logits.device)
+
+            # synced_gpus: don't waste resources running the code we don't need; kwargs must be updated before skipping
+            model_kwargs = self._update_model_kwargs_for_generation(
+                outputs,
+                model_kwargs,
+                is_encoder_decoder=self.config.is_encoder_decoder,
+            )
+            if synced_gpus and this_peer_finished:
+                continue
+
+            next_token_logits = _dola_select_contrast(
+                candidate_premature_layers, candidate_premature_logits, final_logits
+            )
+            next_token_logits = next_token_logits.to(input_ids.device)
+            # pre-process distribution
+            next_token_scores = logits_processor(input_ids, next_token_logits)
+
+            # Store scores, attentions and hidden_states when required
+            if return_dict_in_generate:
+                if output_scores:
+                    scores += (next_token_scores,)
+                if output_logits:
+                    raw_logits += (final_layer_next_token_logits,)
+                if output_attentions:
+                    decoder_attentions += (
+                        (outputs.decoder_attentions,) if self.config.is_encoder_decoder else (outputs.attentions,)
+                    )
+                    if self.config.is_encoder_decoder:
+                        cross_attentions += (outputs.cross_attentions,)
+
+                if output_hidden_states:
+                    decoder_hidden_states += (
+                        (outputs.decoder_hidden_states,)
+                        if self.config.is_encoder_decoder
+                        else (outputs.hidden_states,)
+                    )
+
+            if do_sample:  # sample
+                probs = nn.functional.softmax(next_token_scores, dim=-1)
+                next_tokens = torch.multinomial(probs, num_samples=1).squeeze(1)
+            else:  # argmax
+                next_tokens = torch.argmax(next_token_scores, dim=-1)
+
+            # finished sentences should have their next token be a padding token
+            if has_eos_stopping_criteria:
+                next_tokens = next_tokens * unfinished_sequences + pad_token_id * (1 - unfinished_sequences)
+
+            # update generated ids, model inputs, and length for next step
+            input_ids = torch.cat([input_ids, next_tokens[:, None]], dim=-1)
+            if streamer is not None:
+                streamer.put(next_tokens.cpu())
+
+            # stop when each sentence is finished
+            unfinished_sequences = unfinished_sequences & ~stopping_criteria(input_ids, scores)
+            this_peer_finished = unfinished_sequences.max() == 0
+
+        if streamer is not None:
+            streamer.end()
+
+        if return_dict_in_generate:
+            return GenerateDecoderOnlyOutput(
+                sequences=input_ids,
+                scores=scores,
+                logits=raw_logits,
+                attentions=decoder_attentions,
+                hidden_states=decoder_hidden_states,
+                past_key_values=model_kwargs.get("past_key_values"),
+            )
+        else:
+            return input_ids
+
+    @torch.no_grad()
+    def _contrastive_search(
+        self,
+        input_ids: torch.LongTensor,
+        logits_processor: LogitsProcessorList,
+        stopping_criteria: StoppingCriteriaList,
+        generation_config: GenerationConfig,
+        synced_gpus: bool,
+        streamer: Optional["BaseStreamer"],
+        **model_kwargs,
+    ) -> Union[GenerateNonBeamOutput, torch.LongTensor]:
+        r"""
+        Generates sequences of token ids for models with a language modeling head using **contrastive search** and can
+        be used for text-decoder, text-to-text, speech-to-text, and vision-to-text models.
+
+        Parameters:
+            input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
+                The sequence used as a prompt for the generation.
+            logits_processor (`LogitsProcessorList`):
+                An instance of [`LogitsProcessorList`]. List of instances of class derived from [`LogitsProcessor`]
+                used to modify the prediction scores of the language modeling head applied at each generation step.
+            stopping_criteria (`StoppingCriteriaList`):
+                An instance of [`StoppingCriteriaList`]. List of instances of class derived from [`StoppingCriteria`]
+                used to tell if the generation loop should stop.
+            generation_config ([`~generation.GenerationConfig`]):
+                The generation configuration to be used as parametrization of the decoding method.
+            synced_gpus (`bool`):
+                Whether to continue running the while loop until max_length (needed to avoid deadlocking with
+                `FullyShardedDataParallel` and DeepSpeed ZeRO Stage 3).
+            streamer (`BaseStreamer`, *optional*):
+                Streamer object that will be used to stream the generated sequences. Generated tokens are passed
+                through `streamer.put(token_ids)` and the streamer is responsible for any further processing.
+            model_kwargs:
+                Additional model specific keyword arguments will be forwarded to the `forward` function of the model.
+                If model is an encoder-decoder model the kwargs should include `encoder_outputs`.
+
+        Return:
+            [`~generation.GenerateDecoderOnlyOutput`], [`~generation.GenerateEncoderDecoderOutput`]
+            or `torch.LongTensor`: A `torch.LongTensor` containing the generated tokens (default behaviour) or a
+            [`~generation.GenerateDecoderOnlyOutput`] if `model.config.is_encoder_decoder=False` and
+            `return_dict_in_generate=True` or a [`~generation.GenerateEncoderDecoderOutput`] if
+            `model.config.is_encoder_decoder=True`.
+        """
+        # init values
+        has_eos_stopping_criteria = any(hasattr(criteria, "eos_token_id") for criteria in stopping_criteria)
+        top_k = generation_config.top_k
+        penalty_alpha = generation_config.penalty_alpha
+        pad_token_id = generation_config._pad_token_tensor
+        output_attentions = generation_config.output_attentions
+        output_hidden_states = generation_config.output_hidden_states
+        output_scores = generation_config.output_scores
+        output_logits = generation_config.output_logits
+        return_dict_in_generate = generation_config.return_dict_in_generate
+        sequential = generation_config.low_memory
+
+        # init attention / hidden states / scores tuples
+        raw_logits = () if (return_dict_in_generate and output_logits) else None
+        scores = () if (return_dict_in_generate and output_scores) else None
+        decoder_attentions = () if (return_dict_in_generate and output_attentions) else None
+        cross_attentions = () if (return_dict_in_generate and output_attentions) else None
+        decoder_hidden_states = () if (return_dict_in_generate and output_hidden_states) else None
+
+        # if model is an encoder-decoder, retrieve encoder attention weights and hidden states
+        if return_dict_in_generate and self.config.is_encoder_decoder:
+            encoder_attentions = model_kwargs["encoder_outputs"].get("attentions") if output_attentions else None
+            encoder_hidden_states = (
+                model_kwargs["encoder_outputs"].get("hidden_states") if output_hidden_states else None
+            )
+
+        # keep track of which sequences are already finished
+        batch_size = input_ids.shape[0]
+        unfinished_sequences = torch.ones(batch_size, dtype=torch.long, device=input_ids.device)
+        model_kwargs = self._get_initial_cache_position(input_ids, model_kwargs)
+
+        # Create cosine_matrix_mask based on the attention_mask
+        cosine_matrix_mask = torch.ones_like(input_ids, dtype=torch.long)
+        if self.config.is_encoder_decoder:
+            if "decoder_attention_mask" in model_kwargs and model_kwargs["decoder_attention_mask"] is not None:
+                cosine_matrix_mask = model_kwargs["decoder_attention_mask"]
+        else:
+            cosine_matrix_mask = model_kwargs["attention_mask"]
+        cosine_matrix_mask = cosine_matrix_mask.repeat_interleave(top_k, dim=0)
+
+        this_peer_finished = False
+
+        while self._has_unfinished_sequences(this_peer_finished, synced_gpus, device=input_ids.device):
+            # if the first step in the loop, encode all the prefix and obtain: (1) past_key_values;
+            # (2) last_hidden_states; (3) logit_for_next_step; (4) update model kwargs for the next step
+            if model_kwargs.get("past_key_values") is None or (
+                isinstance(model_kwargs["past_key_values"], (Cache, EncoderDecoderCache))
+                and model_kwargs["past_key_values"].get_seq_length() == 0
+            ):
+                # prepare inputs
+                model_kwargs["use_cache"] = True
+                model_inputs = self.prepare_inputs_for_generation(input_ids, **model_kwargs)
+
+                # encode the given prefix and prepare model inputs; encoder-decoder model process the prefix and save
+                # the `encoder_outputs`
+                outputs = self(
+                    **model_inputs, return_dict=True, output_hidden_states=True, output_attentions=output_attentions
+                )
+
+                # last decoder hidden states will be used to compute the degeneration penalty (cosine similarity with
+                # previous tokens)
+                if self.config.is_encoder_decoder:
+                    last_hidden_states = outputs.decoder_hidden_states[-1]
+                else:
+                    last_hidden_states = outputs.hidden_states[-1]
+
+                # next logit for contrastive search to select top-k candidate tokens
+                # Clone is needed to avoid keeping a hanging ref to outputs.logits which may be very large for this first iteration
+                # (the clone itself is always small)
+                # .float() is needed to retain precision for later logits manipulations
+                logit_for_next_step = outputs.logits[:, -1, :].clone().float()
+                logit_for_next_step = logit_for_next_step.to(input_ids.device)
+
+                model_kwargs = self._update_model_kwargs_for_generation(
+                    outputs,
+                    model_kwargs,
+                    is_encoder_decoder=self.config.is_encoder_decoder,
+                )
+
+                if not sequential:
+                    # Expands model inputs top_k times, for batched forward passes (akin to beam search).
+                    _, model_kwargs = self._expand_inputs_for_generation(
+                        expand_size=top_k, is_encoder_decoder=self.config.is_encoder_decoder, **model_kwargs
+                    )
+
+                past_key_values = model_kwargs.get("past_key_values")
+                if past_key_values is None:
+                    raise ValueError(
+                        f"{self.__class__.__name__} does not support caching and therefore **can't** be used "
+                        "for contrastive search."
+                    )
+                elif (
+                    not isinstance(past_key_values[0], (tuple, torch.Tensor))
+                    or past_key_values[0][0].shape[0] != batch_size
+                ):
+                    raise ValueError(
+                        f"{self.__class__.__name__} does not have a standard cache format and therefore **can't** be "
+                        "used for contrastive search without further modifications."
+                    )
+
+            # contrastive_search main logic start:
+            # contrastive search decoding consists of two steps: (1) candidate tokens recall; (2) candidate re-rank by
+            # degeneration penalty
+            processed_logit_for_next_step = logits_processor(input_ids, logit_for_next_step)
+            next_probs = nn.functional.softmax(processed_logit_for_next_step, dim=-1)
+
+            top_k_probs, top_k_ids = torch.topk(next_probs, dim=-1, k=top_k)
+
+            # Store scores, attentions and hidden_states when required
+            if return_dict_in_generate:
+                if output_logits:
+                    raw_logits += (logit_for_next_step,)
+                if output_scores:
+                    scores += (processed_logit_for_next_step,)
+                if output_attentions:
+                    decoder_attentions += (
+                        (outputs.decoder_attentions,) if self.config.is_encoder_decoder else (outputs.attentions,)
+                    )
+                    if self.config.is_encoder_decoder:
+                        cross_attentions += (outputs.cross_attentions,)
+
+                if output_hidden_states:
+                    decoder_hidden_states += (
+                        (outputs.decoder_hidden_states,)
+                        if self.config.is_encoder_decoder
+                        else (outputs.hidden_states,)
+                    )
+
+            # This is needed to properly delete outputs.logits which may be very large for this first iteration
+            # Otherwise a reference to outputs.logits is kept all along until after the next call to self.forward()
+            del outputs
+
+            if not sequential:
+                # Replicates the new past_key_values to match the `top_k` candidates
+                past = model_kwargs["past_key_values"]
+                # If it is a static cache, modify it in-place layer after layer to save memory
+                if isinstance(past, DynamicCache) or (
+                    isinstance(past, EncoderDecoderCache) and isinstance(past.self_attention_cache, DynamicCache)
+                ):
+                    past.batch_repeat_interleave(top_k)
+                else:
+                    new_key_values = []
+                    for layer in past:
+                        items = []
+                        # item is either the key or the value matrix
+                        for item in layer:
+                            items.append(item.repeat_interleave(top_k, dim=0))
+                        new_key_values.append(tuple(items))
+
+                    past = tuple(new_key_values)
+
+                model_kwargs["past_key_values"] = past
+
+            if sequential:
+                all_outputs = []
+                for i in range(top_k):
+                    # compute the candidate tokens by the language model and collect their hidden_states
+                    next_model_inputs = self.prepare_inputs_for_generation(top_k_ids[:, i].view(-1, 1), **model_kwargs)
+
+                    outputs = self(
+                        **next_model_inputs,
+                        return_dict=True,
+                        output_hidden_states=True,
+                        output_attentions=output_attentions,
+                    )
+                    if isinstance(outputs["past_key_values"], DynamicCache) or (
+                        isinstance(outputs["past_key_values"], EncoderDecoderCache)
+                        and isinstance(outputs["past_key_values"].self_attention_cache, DynamicCache)
+                    ):
+                        # Remove past K-V from output since we don't need to stack later
+                        outputs["past_key_values"] = None
+                        # Remove last token from past K-V since we don't want to append it at this point
+                        model_kwargs["past_key_values"].crop(-1)
+
+                    all_outputs.append(outputs)
+                outputs = stack_model_outputs(all_outputs, self.config.get_text_config())
+
+            else:
+                # compute the candidate tokens by the language model and collect their hidden_states
+                # assembles top_k_ids into batch of size k
+                next_model_inputs = self.prepare_inputs_for_generation(top_k_ids.view(-1, 1), **model_kwargs)
+
+                outputs = self(
+                    **next_model_inputs,
+                    return_dict=True,
+                    output_hidden_states=True,
+                    output_attentions=output_attentions,
+                )
+
+            # This is essential to avoid having a last reference to the big past K-V and double the necessary memory
+            # in the next loop
+            del next_model_inputs
+
+            # name is different for encoder-decoder and decoder-only models
+            if self.config.is_encoder_decoder:
+                next_hidden = outputs.decoder_hidden_states[-1]
+                full_hidden_states = outputs.decoder_hidden_states
+            else:
+                next_hidden = outputs.hidden_states[-1]
+                full_hidden_states = outputs.hidden_states
+
+            # .float() is needed to retain precision for later logits manipulations
+            logits = outputs.logits[:, -1, :].float()
+            context_hidden = last_hidden_states.repeat_interleave(top_k, dim=0)
+
+            # compute the degeneration penalty and re-rank the candidates based on the degeneration penalty and the
+            # model confidence. Keeping `selected_idx` on CPU enables multi-device contrastive search and doesn't
+            # introduce (noticeable) slowdowns on single-device runs.
+            selected_idx = _ranking_fast(
+                context_hidden, next_hidden, top_k_probs, cosine_matrix_mask, penalty_alpha, top_k
+            )
+            cosine_matrix_mask = torch.cat(
+                [cosine_matrix_mask, cosine_matrix_mask.new_ones((cosine_matrix_mask.shape[0], 1))], dim=-1
+            )
+            selected_idx = selected_idx.to("cpu")
+
+            # This will be used instead of the previous inneficient torch.stack(torch.split())
+            augmented_idx = torch.tensor([x + i * top_k for i, x in enumerate(selected_idx)])
+
+            # prepare for the next step: (1) next token_id; (2) past_key_values; (3) last_hidden_states for computing
+            # the degeneration penalty; (4) logits for selecting next top-k candidates; (5) selected tokens scores
+            # (model confidence minus degeneration penalty); (6) decoder hidden_states
+            next_tokens = top_k_ids[range(len(top_k_ids)), selected_idx]
+            next_hidden = torch.stack(torch.split(next_hidden.squeeze(dim=1), top_k))
+            next_hidden = next_hidden[range(batch_size), selected_idx, :]
+            last_hidden_states = torch.cat([last_hidden_states, next_hidden.unsqueeze(1)], dim=1)
+
+            next_decoder_hidden_states = ()
+            for layer in full_hidden_states:
+                layer = torch.stack(torch.split(layer, top_k))[range(batch_size), selected_idx, :]
+                next_decoder_hidden_states += (layer,)
+
+            # generate past_key_values cache of only the selected token
+            if sequential:
+                next_model_input = self.prepare_inputs_for_generation(
+                    top_k_ids[:, selected_idx].view(-1, 1), **model_kwargs
+                )
+
+                selected_outputs = self(
+                    **next_model_input,
+                    return_dict=True,
+                    output_hidden_states=False,
+                    output_attentions=False,
+                )
+                next_past_key_values = selected_outputs["past_key_values"]
+
+            else:
+                _, next_past_key_values = self._extract_past_from_model_output(outputs)
+                # Do it in-place layer per layer to save memory
+                if isinstance(next_past_key_values, DynamicCache) or (
+                    isinstance(next_past_key_values, EncoderDecoderCache)
+                    and isinstance(next_past_key_values.self_attention_cache, DynamicCache)
+                ):
+                    next_past_key_values.batch_select_indices(augmented_idx)
+                else:
+                    new_key_values = []
+                    for layer in next_past_key_values:
+                        items = []
+                        # item is either the key or the value matrix
+                        for item in layer:
+                            items.append(item[augmented_idx, ...])
+                        new_key_values.append(tuple(items))
+
+                    next_past_key_values = tuple(new_key_values)
+
+            logit_for_next_step = torch.stack(torch.split(logits, top_k))[range(batch_size), selected_idx, :]
+            logit_for_next_step = logit_for_next_step.to(input_ids.device)
+
+            # Rebuilds the relevant parts of the model output for the selected token, for use in the next iteration
+            if self.config.is_encoder_decoder:
+                next_step_cross_attentions = ()
+                next_step_decoder_attentions = ()
+                if output_attentions:
+                    for layer in outputs.cross_attentions:
+                        layer = torch.stack(torch.split(layer, top_k, dim=0))[range(batch_size), selected_idx, ...]
+                        next_step_cross_attentions += (layer,)
+                    for layer in outputs.decoder_attentions:
+                        layer = torch.stack(torch.split(layer, top_k, dim=0))[range(batch_size), selected_idx, ...]
+                        next_step_decoder_attentions += (layer,)
+                outputs = Seq2SeqLMOutput(
+                    past_key_values=next_past_key_values,
+                    decoder_hidden_states=next_decoder_hidden_states,
+                    decoder_attentions=next_step_decoder_attentions or None,
+                    cross_attentions=next_step_cross_attentions or None,
+                )
+            else:
+                next_step_attentions = ()
+                if output_attentions:
+                    for layer in outputs.attentions:
+                        layer = torch.stack(torch.split(layer, top_k, dim=0))[range(batch_size), selected_idx, ...]
+                        next_step_attentions += (layer,)
+                outputs = CausalLMOutputWithPast(
+                    past_key_values=next_past_key_values,
+                    hidden_states=next_decoder_hidden_states,
+                    attentions=next_step_attentions or None,
+                )
+            # contrastive_search main logic end
+
+            # synced_gpus: don't waste resources running the code we don't need; kwargs must be updated before skipping
+            model_kwargs = self._update_model_kwargs_for_generation(
+                outputs,
+                model_kwargs,
+                is_encoder_decoder=self.config.is_encoder_decoder,
+            )
+            if synced_gpus and this_peer_finished:
+                continue
+
+            # finished sentences should have their next token be a padding token
+            if has_eos_stopping_criteria:
+                next_tokens = next_tokens * unfinished_sequences + pad_token_id * (1 - unfinished_sequences)
+
+            # update generated ids, model inputs, and length for next step
+            input_ids = torch.cat([input_ids, next_tokens[:, None]], dim=-1)
+            if streamer is not None:
+                streamer.put(next_tokens.cpu())
+
+            # stop when each sentence is finished
+            unfinished_sequences = unfinished_sequences & ~stopping_criteria(input_ids, scores)
+            this_peer_finished = unfinished_sequences.max() == 0
+
+        if streamer is not None:
+            streamer.end()
+
+        if return_dict_in_generate:
+            # Contrastive search works by forward looking at the next token, so we need to exclude it from
+            # `past_key_values` to be consistent with the other decoding methods
+            if model_kwargs.get("past_key_values") is not None:
+                if isinstance(model_kwargs["past_key_values"], DynamicCache) or (
+                    isinstance(model_kwargs["past_key_values"], EncoderDecoderCache)
+                    and isinstance(model_kwargs["past_key_values"].self_attention_cache, DynamicCache)
+                ):
+                    model_kwargs["past_key_values"].crop(-1)
+                else:
+                    past_key_values = []
+                    for layer in model_kwargs["past_key_values"]:
+                        layer_past_key_values = []
+                        for item in layer:
+                            layer_past_key_values.append(item[..., :-1, :])
+                        past_key_values.append(tuple(layer_past_key_values))
+                    model_kwargs["past_key_values"] = tuple(past_key_values)
+
+            if self.config.is_encoder_decoder:
+                return GenerateEncoderDecoderOutput(
+                    sequences=input_ids,
+                    scores=scores,
+                    logits=raw_logits,
+                    encoder_attentions=encoder_attentions,
+                    encoder_hidden_states=encoder_hidden_states,
+                    decoder_attentions=decoder_attentions,
+                    cross_attentions=cross_attentions,
+                    decoder_hidden_states=decoder_hidden_states,
+                    past_key_values=model_kwargs.get("past_key_values"),
+                )
+            else:
+                return GenerateDecoderOnlyOutput(
+                    sequences=input_ids,
+                    scores=scores,
+                    logits=raw_logits,
+                    attentions=decoder_attentions,
+                    hidden_states=decoder_hidden_states,
+                    past_key_values=model_kwargs.get("past_key_values"),
+                )
+        else:
+            return input_ids
+
+    def _sample(
+        self,
+        input_ids: torch.LongTensor,
+        logits_processor: LogitsProcessorList,
+        stopping_criteria: StoppingCriteriaList,
+        generation_config: GenerationConfig,
+        synced_gpus: bool,
+        streamer: Optional["BaseStreamer"],
+        **model_kwargs,
+    ) -> Union[GenerateNonBeamOutput, torch.LongTensor]:
+        r"""
+        Generates sequences of token ids for models with a language modeling head using **multinomial sampling** and
+        can be used for text-decoder, text-to-text, speech-to-text, and vision-to-text models.
+
+        Parameters:
+            input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
+                The sequence used as a prompt for the generation.
+            logits_processor (`LogitsProcessorList`):
+                An instance of [`LogitsProcessorList`]. List of instances of class derived from [`LogitsProcessor`]
+                used to modify the prediction scores of the language modeling head applied at each generation step.
+            stopping_criteria (`StoppingCriteriaList`):
+                An instance of [`StoppingCriteriaList`]. List of instances of class derived from [`StoppingCriteria`]
+                used to tell if the generation loop should stop.
+            generation_config ([`~generation.GenerationConfig`]):
+                The generation configuration to be used as parametrization of the decoding method.
+            synced_gpus (`bool`):
+                Whether to continue running the while loop until max_length (needed to avoid deadlocking with
+                `FullyShardedDataParallel` and DeepSpeed ZeRO Stage 3).
+            streamer (`BaseStreamer`, *optional*):
+                Streamer object that will be used to stream the generated sequences. Generated tokens are passed
+                through `streamer.put(token_ids)` and the streamer is responsible for any further processing.
+            model_kwargs:
+                Additional model specific kwargs will be forwarded to the `forward` function of the model. If model is
+                an encoder-decoder model the kwargs should include `encoder_outputs`.
+
+        Return:
+            [`~generation.GenerateDecoderOnlyOutput`], [`~generation.GenerateEncoderDecoderOutput`] or `torch.LongTensor`:
+            A `torch.LongTensor` containing the generated tokens (default behaviour) or a
+            [`~generation.GenerateDecoderOnlyOutput`] if `model.config.is_encoder_decoder=False` and
+            `return_dict_in_generate=True` or a [`~generation.GenerateEncoderDecoderOutput`] if
+            `model.config.is_encoder_decoder=True`.
+        """
+        # init values
+        pad_token_id = generation_config._pad_token_tensor
+        output_attentions = generation_config.output_attentions
+        output_hidden_states = generation_config.output_hidden_states
+        output_scores = generation_config.output_scores
+        output_logits = generation_config.output_logits
+        return_dict_in_generate = generation_config.return_dict_in_generate
+        max_length = generation_config.max_length
+        has_eos_stopping_criteria = any(hasattr(criteria, "eos_token_id") for criteria in stopping_criteria)
+        do_sample = generation_config.do_sample
+
+        # init attention / hidden states / scores tuples
+        scores = () if (return_dict_in_generate and output_scores) else None
+        raw_logits = () if (return_dict_in_generate and output_logits) else None
+        decoder_attentions = () if (return_dict_in_generate and output_attentions) else None
+        cross_attentions = () if (return_dict_in_generate and output_attentions) else None
+        decoder_hidden_states = () if (return_dict_in_generate and output_hidden_states) else None
+
+        # if model is an encoder-decoder, retrieve encoder attention weights and hidden states
+        if return_dict_in_generate and self.config.is_encoder_decoder:
+            encoder_attentions = model_kwargs["encoder_outputs"].get("attentions") if output_attentions else None
+            encoder_hidden_states = (
+                model_kwargs["encoder_outputs"].get("hidden_states") if output_hidden_states else None
+            )
+
+        # keep track of which sequences are already finished
+        batch_size, cur_len = input_ids.shape
+        this_peer_finished = False
+        unfinished_sequences = torch.ones(batch_size, dtype=torch.long, device=input_ids.device)
+        model_kwargs = self._get_initial_cache_position(input_ids, model_kwargs)
+
+        model_forward = self.__call__
+        if isinstance(model_kwargs.get("past_key_values"), StaticCache):
+            if self.device.type == "cuda":
+                logger.warning_once("Using `torch.compile`.")
+                os.environ["TOKENIZERS_PARALLELISM"] = "0"
+                model_forward = self.get_compiled_call(generation_config.compile_config)
+
+        is_prefill = True
+        while self._has_unfinished_sequences(
+            this_peer_finished, synced_gpus, device=input_ids.device, cur_len=cur_len, max_length=max_length
+        ):
+            # prepare model inputs
+            model_inputs = self.prepare_inputs_for_generation(input_ids, **model_kwargs)
+
+            # prepare variable output controls (note: some models won't accept all output controls)
+            model_inputs.update({"output_attentions": output_attentions} if output_attentions else {})
+            model_inputs.update({"output_hidden_states": output_hidden_states} if output_hidden_states else {})
+
+            if is_prefill:
+                outputs = self(**model_inputs, return_dict=True)
+                is_prefill = False
+            else:
+                outputs = model_forward(**model_inputs, return_dict=True)
+
+            # synced_gpus: don't waste resources running the code we don't need; kwargs must be updated before skipping
+            model_kwargs = self._update_model_kwargs_for_generation(
+                outputs,
+                model_kwargs,
+                is_encoder_decoder=self.config.is_encoder_decoder,
+            )
+            if synced_gpus and this_peer_finished:
+                continue
+
+            # Clone is needed to avoid keeping a hanging ref to outputs.logits which may be very large for first iteration
+            # (the clone itself is always small)
+            next_token_logits = outputs.logits[:, -1, :].clone().float()
+            next_token_logits = next_token_logits.to(input_ids.device)
+
+            # pre-process distribution
+            next_token_scores = logits_processor(input_ids, next_token_logits)
+
+            # Store scores, attentions and hidden_states when required
+            if return_dict_in_generate:
+                if output_scores:
+                    scores += (next_token_scores,)
+                if output_logits:
+                    raw_logits += (next_token_logits,)
+                if output_attentions:
+                    decoder_attentions += (
+                        (outputs.decoder_attentions,) if self.config.is_encoder_decoder else (outputs.attentions,)
+                    )
+                    if self.config.is_encoder_decoder:
+                        cross_attentions += (outputs.cross_attentions,)
+
+                if output_hidden_states:
+                    decoder_hidden_states += (
+                        (outputs.decoder_hidden_states,)
+                        if self.config.is_encoder_decoder
+                        else (outputs.hidden_states,)
+                    )
+
+            # token selection
+            if do_sample:
+                probs = nn.functional.softmax(next_token_scores, dim=-1)
+                # TODO (joao): this OP throws "skipping cudagraphs due to ['incompatible ops']", find solution
+                next_tokens = torch.multinomial(probs, num_samples=1).squeeze(1)
+            else:
+                next_tokens = torch.argmax(next_token_scores, dim=-1)
+
+            # finished sentences should have their next token be a padding token
+            if has_eos_stopping_criteria:
+                next_tokens = next_tokens * unfinished_sequences + pad_token_id * (1 - unfinished_sequences)
+
+            # update generated ids, model inputs, and length for next step
+            input_ids = torch.cat([input_ids, next_tokens[:, None]], dim=-1)
+            if streamer is not None:
+                streamer.put(next_tokens.cpu())
+
+            unfinished_sequences = unfinished_sequences & ~stopping_criteria(input_ids, scores)
+            this_peer_finished = unfinished_sequences.max() == 0
+            cur_len += 1
+
+            # This is needed to properly delete outputs.logits which may be very large for first iteration
+            # Otherwise a reference to outputs is kept which keeps the logits alive in the next iteration
+            del outputs
+
+        if streamer is not None:
+            streamer.end()
+
+        if return_dict_in_generate:
+            if self.config.is_encoder_decoder:
+                return GenerateEncoderDecoderOutput(
+                    sequences=input_ids,
+                    scores=scores,
+                    logits=raw_logits,
+                    encoder_attentions=encoder_attentions,
+                    encoder_hidden_states=encoder_hidden_states,
+                    decoder_attentions=decoder_attentions,
+                    cross_attentions=cross_attentions,
+                    decoder_hidden_states=decoder_hidden_states,
+                    past_key_values=model_kwargs.get("past_key_values"),
+                )
+            else:
+                return GenerateDecoderOnlyOutput(
+                    sequences=input_ids,
+                    scores=scores,
+                    logits=raw_logits,
+                    attentions=decoder_attentions,
+                    hidden_states=decoder_hidden_states,
+                    past_key_values=model_kwargs.get("past_key_values"),
+                )
+        else:
+            return input_ids
+
+    def _temporary_reorder_cache(self, past_key_values, beam_idx):
+        """
+        Temporary function to handle the different types of cache reordering processes while we roll out `Cache`.
+
+        TODO: standardize cache formats and make all models compatible with `Cache`. It would remove the need
+        for this function, with `Cache.reorder_cache` being the sole remaining code path
+        """
+        model_class = self.__class__.__name__.lower()
+        # Exception 1: code path for models using the legacy cache format
+        if isinstance(past_key_values, (tuple, list)):
+            past_key_values = self._reorder_cache(past_key_values, beam_idx)
+        # Exception 2: models with different cache formats. These are limited to `DynamicCache` until their
+        # cache format is standardized, to avoid adding complexity to the codebase.
+        elif "gptbigcode" in model_class:
+            if not isinstance(past_key_values, (DynamicCache, EncoderDecoderCache)):
+                raise ValueError(
+                    f"Using an unsupported cache format with {model_class}. Currently, it only supports the "
+                    "legacy tuple format or `DynamicCache`"
+                )
+            past_key_values = self._reorder_cache(past_key_values, beam_idx)
+            past_key_values = DynamicCache.from_legacy_cache(past_key_values)
+        # Standard code path: use the `Cache.reorder_cache`
+        else:
+            past_key_values.reorder_cache(beam_idx)
+        return past_key_values
+
+    def _beam_search(
+        self,
+        input_ids: torch.LongTensor,
+        beam_scorer: BeamScorer,
+        logits_processor: LogitsProcessorList,
+        stopping_criteria: StoppingCriteriaList,
+        generation_config: GenerationConfig,
+        synced_gpus: bool,
+        **model_kwargs,
+    ) -> Union[GenerateBeamOutput, torch.LongTensor]:
+        r"""
+        Generates sequences of token ids for models with a language modeling head using **beam search decoding** and
+        can be used for text-decoder, text-to-text, speech-to-text, and vision-to-text models.
+
+        Parameters:
+            input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
+                The sequence used as a prompt for the generation.
+            beam_scorer (`BeamScorer`):
+                An derived instance of [`BeamScorer`] that defines how beam hypotheses are constructed, stored and
+                sorted during generation. For more information, the documentation of [`BeamScorer`] should be read.
+            logits_processor (`LogitsProcessorList`):
+                An instance of [`LogitsProcessorList`]. List of instances of class derived from [`LogitsProcessor`]
+                used to modify the prediction scores of the language modeling head applied at each generation step.
+            stopping_criteria (`StoppingCriteriaList`:
+                An instance of [`StoppingCriteriaList`]. List of instances of class derived from [`StoppingCriteria`]
+                used to tell if the generation loop should stop.
+            generation_config ([`~generation.GenerationConfig`]):
+                The generation configuration to be used as parametrization of the decoding method.
+            synced_gpus (`bool`):
+                Whether to continue running the while loop until max_length (needed to avoid deadlocking with
+                `FullyShardedDataParallel` and DeepSpeed ZeRO Stage 3).
+            model_kwargs:
+                Additional model specific kwargs will be forwarded to the `forward` function of the model. If model is
+                an encoder-decoder model the kwargs should include `encoder_outputs`.
+
+        Return:
+            [`generation.GenerateBeamDecoderOnlyOutput`], [`~generation.GenerateBeamEncoderDecoderOutput`] or
+            `torch.LongTensor`: A `torch.LongTensor` containing the generated tokens (default behaviour) or a
+            [`~generation.GenerateBeamDecoderOnlyOutput`] if `model.config.is_encoder_decoder=False` and
+            `return_dict_in_generate=True` or a [`~generation.GenerateBeamEncoderDecoderOutput`] if
+            `model.config.is_encoder_decoder=True`.
+        """
+        # init values
+        pad_token_id = generation_config._pad_token_tensor
+        eos_token_id = generation_config._eos_token_tensor
+        output_attentions = generation_config.output_attentions
+        output_hidden_states = generation_config.output_hidden_states
+        output_scores = generation_config.output_scores
+        output_logits = generation_config.output_logits
+        return_dict_in_generate = generation_config.return_dict_in_generate
+        sequential = generation_config.low_memory
+        do_sample = generation_config.do_sample
+
+        batch_size = len(beam_scorer._beam_hyps)
+        num_beams = beam_scorer.num_beams
+
+        batch_beam_size, cur_len = input_ids.shape
+        model_kwargs = self._get_initial_cache_position(input_ids, model_kwargs)
+
+        if num_beams * batch_size != batch_beam_size:
+            raise ValueError(
+                f"Batch dimension of `input_ids` should be {num_beams * batch_size}, but is {batch_beam_size}."
+            )
+
+        # init attention / hidden states / scores tuples
+        scores = () if (return_dict_in_generate and output_scores) else None
+        raw_logits = () if (return_dict_in_generate and output_logits) else None
+        beam_indices = (
+            tuple(() for _ in range(batch_beam_size)) if (return_dict_in_generate and output_scores) else None
+        )
+        decoder_attentions = () if (return_dict_in_generate and output_attentions) else None
+        cross_attentions = () if (return_dict_in_generate and output_attentions) else None
+        decoder_hidden_states = () if (return_dict_in_generate and output_hidden_states) else None
+
+        # if model is an encoder-decoder, retrieve encoder attention weights and hidden states
+        if return_dict_in_generate and self.config.is_encoder_decoder:
+            encoder_attentions = model_kwargs["encoder_outputs"].get("attentions") if output_attentions else None
+            encoder_hidden_states = (
+                model_kwargs["encoder_outputs"].get("hidden_states") if output_hidden_states else None
+            )
+
+        # initialise score of first beam with 0 and the rest with -1e9. This makes sure that only tokens
+        # of the first beam are considered to avoid sampling the exact same tokens across all beams.
+        beam_scores = torch.zeros((batch_size, num_beams), dtype=torch.float, device=input_ids.device)
+        beam_scores[:, 1:] = -1e9
+        beam_scores = beam_scores.view((batch_size * num_beams,))
+
+        this_peer_finished = False
+
+        decoder_prompt_len = input_ids.shape[-1]  # record the prompt length of decoder
+
+        while self._has_unfinished_sequences(this_peer_finished, synced_gpus, device=input_ids.device):
+            model_inputs = self.prepare_inputs_for_generation(input_ids, **model_kwargs)
+
+            # prepare variable output controls (note: some models won't accept all output controls)
+            model_inputs.update({"output_attentions": output_attentions} if output_attentions else {})
+            model_inputs.update({"output_hidden_states": output_hidden_states} if output_hidden_states else {})
+
+            # if sequential is True, split the input to batches of batch_size and run sequentially
+            if sequential:
+                if any(
+                    model_name in self.__class__.__name__.lower()
+                    for model_name in [
+                        "fsmt",
+                        "reformer",
+                        "ctrl",
+                        "gpt_bigcode",
+                        "transo_xl",
+                        "xlnet",
+                        "cpm",
+                        "jamba",
+                    ]
+                ):
+                    raise RuntimeError(
+                        f"Currently generation for {self.__class__.__name__} is not supported "
+                        f"for `low_memory beam_search`. Please open an issue on GitHub if you need this feature."
+                    )
+
+                inputs_per_sub_batches = _split_model_inputs(
+                    model_inputs,
+                    split_size=batch_size,
+                    full_batch_size=batch_beam_size,
+                    config=self.config.get_text_config(),
+                )
+                outputs_per_sub_batch = [
+                    self(**inputs_per_sub_batch, return_dict=True) for inputs_per_sub_batch in inputs_per_sub_batches
+                ]
+
+                outputs = stack_model_outputs(outputs_per_sub_batch, self.config.get_text_config())
+
+            else:  # Unchanged original behavior
+                outputs = self(**model_inputs, return_dict=True)
+
+            # synced_gpus: don't waste resources running the code we don't need; kwargs must be updated before skipping
+            model_kwargs = self._update_model_kwargs_for_generation(
+                outputs,
+                model_kwargs,
+                is_encoder_decoder=self.config.is_encoder_decoder,
+            )
+            if synced_gpus and this_peer_finished:
+                cur_len = cur_len + 1
+                continue
+
+            # Clone is needed to avoid keeping a hanging ref to outputs.logits which may be very large for first iteration
+            # (the clone itself is always small)
+            # .float() is needed to retain precision for later logits manipulations
+            next_token_logits = outputs.logits[:, -1, :].clone().float()
+            next_token_logits = next_token_logits.to(input_ids.device)
+            next_token_scores = nn.functional.log_softmax(
+                next_token_logits, dim=-1
+            )  # (batch_size * num_beams, vocab_size)
+
+            next_token_scores_processed = logits_processor(input_ids, next_token_scores)
+            next_token_scores = next_token_scores_processed + beam_scores[:, None].expand_as(
+                next_token_scores_processed
+            )
+
+            # Store scores, attentions and hidden_states when required
+            if return_dict_in_generate:
+                if output_scores:
+                    scores += (next_token_scores_processed,)
+                if output_logits:
+                    raw_logits += (next_token_logits,)
+                if output_attentions:
+                    decoder_attentions += (
+                        (outputs.decoder_attentions,) if self.config.is_encoder_decoder else (outputs.attentions,)
+                    )
+                    if self.config.is_encoder_decoder:
+                        cross_attentions += (outputs.cross_attentions,)
+                if output_hidden_states:
+                    decoder_hidden_states += (
+                        (outputs.decoder_hidden_states,)
+                        if self.config.is_encoder_decoder
+                        else (outputs.hidden_states,)
+                    )
+
+            # reshape for beam search
+            vocab_size = next_token_scores.shape[-1]
+            next_token_scores = next_token_scores.view(batch_size, num_beams * vocab_size)
+
+            # Beam token selection: pick 1 + eos_token_id.shape[0] next tokens for each beam so we have at least 1
+            # non eos token per beam.
+            n_eos_tokens = eos_token_id.shape[0] if eos_token_id is not None else 0
+            n_tokens_to_keep = max(2, 1 + n_eos_tokens) * num_beams
+            if do_sample:
+                probs = nn.functional.softmax(next_token_scores, dim=-1)
+                next_tokens = torch.multinomial(probs, num_samples=n_tokens_to_keep)
+                next_token_scores = torch.gather(next_token_scores, -1, next_tokens)
+                next_token_scores, _indices = torch.sort(next_token_scores, descending=True, dim=1)
+                next_tokens = torch.gather(next_tokens, -1, _indices)
+            else:
+                next_token_scores, next_tokens = torch.topk(
+                    next_token_scores, n_tokens_to_keep, dim=1, largest=True, sorted=True
+                )
+
+            next_indices = torch.div(next_tokens, vocab_size, rounding_mode="floor")
+            next_tokens = next_tokens % vocab_size
+
+            # stateless
+            beam_outputs = beam_scorer.process(
+                input_ids,
+                next_token_scores,
+                next_tokens,
+                next_indices,
+                pad_token_id=pad_token_id,
+                eos_token_id=eos_token_id,
+                beam_indices=beam_indices,
+                decoder_prompt_len=decoder_prompt_len,
+            )
+
+            beam_scores = beam_outputs["next_beam_scores"]
+            beam_next_tokens = beam_outputs["next_beam_tokens"]
+            beam_idx = beam_outputs["next_beam_indices"]
+
+            input_ids = torch.cat([input_ids[beam_idx, :], beam_next_tokens.unsqueeze(-1)], dim=-1)
+
+            # This is needed to properly delete outputs.logits which may be very large for first iteration
+            # Otherwise a reference to outputs is kept which keeps the logits alive in the next iteration
+            # IMPORTANT: Note that this should appear BEFORE the call to _reorder_cache() to save the maximum memory
+            # (that way the memory peak does not include outputs.logits)
+            del outputs
+
+            if model_kwargs.get("past_key_values", None) is not None:
+                model_kwargs["past_key_values"] = self._temporary_reorder_cache(
+                    model_kwargs["past_key_values"], beam_idx
+                )
+
+            if return_dict_in_generate and output_scores:
+                beam_indices = tuple((beam_indices[beam_idx[i]] + (beam_idx[i],) for i in range(len(beam_indices))))
+
+            # increase cur_len
+            cur_len = cur_len + 1
+
+            if beam_scorer.is_done or all(stopping_criteria(input_ids, scores)):
+                this_peer_finished = True
+
+        sequence_outputs = beam_scorer.finalize(
+            input_ids,
+            beam_scores,
+            next_tokens,
+            next_indices,
+            pad_token_id=pad_token_id,
+            eos_token_id=eos_token_id,
+            max_length=stopping_criteria.max_length,
+            beam_indices=beam_indices,
+            decoder_prompt_len=decoder_prompt_len,
+        )
+
+        if return_dict_in_generate:
+            if not output_scores:
+                sequence_outputs["sequence_scores"] = None
+
+            if self.config.is_encoder_decoder:
+                return GenerateBeamEncoderDecoderOutput(
+                    sequences=sequence_outputs["sequences"],
+                    sequences_scores=sequence_outputs["sequence_scores"],
+                    scores=scores,
+                    logits=raw_logits,
+                    beam_indices=sequence_outputs["beam_indices"],
+                    encoder_attentions=encoder_attentions,
+                    encoder_hidden_states=encoder_hidden_states,
+                    decoder_attentions=decoder_attentions,
+                    cross_attentions=cross_attentions,
+                    decoder_hidden_states=decoder_hidden_states,
+                    past_key_values=model_kwargs.get("past_key_values"),
+                )
+            else:
+                return GenerateBeamDecoderOnlyOutput(
+                    sequences=sequence_outputs["sequences"],
+                    sequences_scores=sequence_outputs["sequence_scores"],
+                    scores=scores,
+                    logits=raw_logits,
+                    beam_indices=sequence_outputs["beam_indices"],
+                    attentions=decoder_attentions,
+                    hidden_states=decoder_hidden_states,
+                    past_key_values=model_kwargs.get("past_key_values"),
+                )
+        else:
+            return sequence_outputs["sequences"]
+
+    def _group_beam_search(
+        self,
+        input_ids: torch.LongTensor,
+        beam_scorer: BeamScorer,
+        logits_processor: LogitsProcessorList,
+        stopping_criteria: StoppingCriteriaList,
+        generation_config: GenerationConfig,
+        synced_gpus: bool,
+        **model_kwargs,
+    ):
+        r"""
+        Generates sequences of token ids for models with a language modeling head using **diverse beam search
+        decoding** and can be used for text-decoder, text-to-text, speech-to-text, and vision-to-text models.
+
+        Parameters:
+            input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
+                The sequence used as a prompt for the generation.
+            beam_scorer (`BeamScorer`):
+                An derived instance of [`BeamScorer`] that defines how beam hypotheses are constructed, stored and
+                sorted during generation. For more information, the documentation of [`BeamScorer`] should be read.
+            logits_processor (`LogitsProcessorList`):
+                An instance of [`LogitsProcessorList`]. List of instances of class derived from [`LogitsProcessor`]
+                used to modify the prediction scores of the language modeling head applied at each generation step.
+            stopping_criteria (`StoppingCriteriaList`):
+                An instance of [`StoppingCriteriaList`]. List of instances of class derived from [`StoppingCriteria`]
+                used to tell if the generation loop should stop.
+            generation_config ([`~generation.GenerationConfig`]):
+                The generation configuration to be used as parametrization of the decoding method.
+            synced_gpus (`bool`):
+                Whether to continue running the while loop until max_length (needed to avoid deadlocking with
+                `FullyShardedDataParallel` and DeepSpeed ZeRO Stage 3).
+            model_kwargs:
+                Additional model specific kwargs that will be forwarded to the `forward` function of the model. If
+                model is an encoder-decoder model the kwargs should include `encoder_outputs`.
+
+        Return:
+            [`~generation.GenerateBeamDecoderOnlyOutput`], [`~generation.GenerateBeamEncoderDecoderOutput`] or
+            `torch.LongTensor`: A `torch.LongTensor` containing the generated tokens (default behaviour) or a
+            [`~generation.GenerateBeamDecoderOnlyOutput`] if `model.config.is_encoder_decoder=False` and
+            `return_dict_in_generate=True` or a [`~generation.GenerateBeamEncoderDecoderOutput`] if
+            `model.config.is_encoder_decoder=True`.
+        """
+        # init values
+        pad_token_id = generation_config._pad_token_tensor
+        eos_token_id = generation_config._eos_token_tensor
+        output_attentions = generation_config.output_attentions
+        output_hidden_states = generation_config.output_hidden_states
+        output_scores = generation_config.output_scores
+        output_logits = generation_config.output_logits
+        return_dict_in_generate = generation_config.return_dict_in_generate
+
+        num_beams = beam_scorer.num_beams
+        num_beam_groups = beam_scorer.num_beam_groups
+        num_sub_beams = num_beams // num_beam_groups
+        batch_size = len(beam_scorer._beam_hyps) // num_beam_groups
+        device = input_ids.device
+
+        batch_beam_size, cur_len = input_ids.shape
+        model_kwargs = self._get_initial_cache_position(input_ids, model_kwargs)
+
+        if return_dict_in_generate and output_scores:
+            beam_indices = [tuple(() for _ in range(num_sub_beams * batch_size)) for _ in range(num_beam_groups)]
+        else:
+            beam_indices = None
+
+        if num_beams * batch_size != batch_beam_size:
+            raise ValueError(
+                f"Batch dimension of `input_ids` should be {num_beams * batch_size}, but is {batch_beam_size}."
+            )
+
+        # init attention / hidden states / scores tuples
+        scores = () if (return_dict_in_generate and output_scores) else None
+        raw_logits = () if (return_dict_in_generate and output_logits) else None
+        decoder_attentions = () if (return_dict_in_generate and output_attentions) else None
+        cross_attentions = () if (return_dict_in_generate and output_attentions) else None
+        decoder_hidden_states = () if (return_dict_in_generate and output_hidden_states) else None
+
+        # if model is an encoder-decoder, retrieve encoder attention weights and hidden states
+        if return_dict_in_generate and self.config.is_encoder_decoder:
+            encoder_attentions = model_kwargs["encoder_outputs"].get("attentions") if output_attentions else None
+            encoder_hidden_states = (
+                model_kwargs["encoder_outputs"].get("hidden_states") if output_hidden_states else None
+            )
+
+        # initialise score of first beam of each group with 0 and the rest with -1e9. This ensures that the beams in
+        # the same group don't produce same tokens every time.
+        beam_scores = torch.full((batch_size, num_beams), -1e9, dtype=torch.float, device=device)
+        beam_scores[:, ::num_sub_beams] = 0
+        beam_scores = beam_scores.view((batch_size * num_beams,))
+
+        this_peer_finished = False
+
+        decoder_prompt_len = input_ids.shape[-1]  # record the prompt length of decoder
+        while self._has_unfinished_sequences(this_peer_finished, synced_gpus, device=input_ids.device):
+            # predicted tokens in cur_len step
+            current_tokens = torch.zeros(batch_size * num_beams, dtype=input_ids.dtype, device=device)
+
+            # indices which will form the beams in the next time step
+            reordering_indices = torch.zeros(batch_size * num_beams, dtype=torch.long, device=device)
+
+            # do one decoder step on all beams of all sentences in batch
+            model_inputs = self.prepare_inputs_for_generation(input_ids, **model_kwargs)
+
+            # prepare variable output controls (note: some models won't accept all output controls)
+            model_inputs.update({"output_attentions": output_attentions} if output_attentions else {})
+            model_inputs.update({"output_hidden_states": output_hidden_states} if output_hidden_states else {})
+
+            outputs = self(**model_inputs, return_dict=True)
+
+            # synced_gpus: don't waste resources running the code we don't need; kwargs must be updated before skipping
+            model_kwargs = self._update_model_kwargs_for_generation(
+                outputs,
+                model_kwargs,
+                is_encoder_decoder=self.config.is_encoder_decoder,
+            )
+            if synced_gpus and this_peer_finished:
+                cur_len = cur_len + 1
+                continue
+
+            if output_scores:
+                processed_score = torch.zeros_like(outputs.logits[:, -1, :])
+            if output_logits:
+                # Clone is needed to avoid keeping a hanging ref to outputs.logits which may be very large for first iteration
+                # (the clone itself is always small)
+                raw_logit_score = outputs.logits[:, -1, :].clone()
+                raw_logit_score = raw_logit_score.to(input_ids.device)
+
+            for beam_group_idx in range(num_beam_groups):
+                group_start_idx = beam_group_idx * num_sub_beams
+                group_end_idx = min(group_start_idx + num_sub_beams, num_beams)
+                group_size = group_end_idx - group_start_idx
+
+                # indices of beams of current group among all sentences in batch
+                batch_group_indices = []
+
+                for batch_idx in range(batch_size):
+                    batch_group_indices.extend(
+                        [batch_idx * num_beams + idx for idx in range(group_start_idx, group_end_idx)]
+                    )
+                group_input_ids = input_ids[batch_group_indices]
+
+                # select outputs of beams of current group only
+                # No need to clone() the logits here as they will not retain outputs.logits at the end of the loop
+                # .float() is needed to retain precision for later logits manipulations
+                next_token_logits = outputs.logits[batch_group_indices, -1, :].float()
+                next_token_logits = next_token_logits.to(input_ids.device)
+
+                next_token_scores = nn.functional.log_softmax(
+                    next_token_logits, dim=-1
+                )  # (batch_size * group_size, vocab_size)
+                vocab_size = next_token_scores.shape[-1]
+
+                next_token_scores_processed = logits_processor(
+                    group_input_ids, next_token_scores, current_tokens=current_tokens, beam_group_idx=beam_group_idx
+                )
+                next_token_scores = next_token_scores_processed + beam_scores[batch_group_indices].unsqueeze(-1)
+                next_token_scores = next_token_scores.expand_as(next_token_scores_processed)
+
+                if output_scores:
+                    processed_score[batch_group_indices] = next_token_scores_processed
+
+                # reshape for beam search
+                next_token_scores = next_token_scores.view(batch_size, group_size * vocab_size)
+
+                # Sample 1 + len(eos_token_id) next tokens for each beam so we have at least 1 non eos token per beam.
+                n_eos_tokens = eos_token_id.shape[0] if eos_token_id is not None else 0
+                next_token_scores, next_tokens = torch.topk(
+                    next_token_scores, max(2, 1 + n_eos_tokens) * group_size, dim=1, largest=True, sorted=True
+                )
+
+                next_indices = torch.div(next_tokens, vocab_size, rounding_mode="floor")
+                next_tokens = next_tokens % vocab_size
+
+                # stateless
+                process_beam_indices = sum(beam_indices, ()) if beam_indices is not None else None
+                beam_outputs = beam_scorer.process(
+                    group_input_ids,
+                    next_token_scores,
+                    next_tokens,
+                    next_indices,
+                    pad_token_id=pad_token_id,
+                    eos_token_id=eos_token_id,
+                    beam_indices=process_beam_indices,
+                    group_index=beam_group_idx,
+                    decoder_prompt_len=decoder_prompt_len,
+                )
+                beam_scores[batch_group_indices] = beam_outputs["next_beam_scores"]
+                beam_next_tokens = beam_outputs["next_beam_tokens"]
+                beam_idx = beam_outputs["next_beam_indices"]
+
+                if return_dict_in_generate and output_scores:
+                    beam_indices[beam_group_idx] = tuple(
+                        beam_indices[beam_group_idx][beam_idx[i]] + (beam_idx[i],) for i in range(len(beam_indices[0]))
+                    )
+
+                input_ids[batch_group_indices] = group_input_ids[beam_idx]
+                group_input_ids = torch.cat([group_input_ids[beam_idx, :], beam_next_tokens.unsqueeze(-1)], dim=-1)
+                current_tokens[batch_group_indices] = group_input_ids[:, -1]
+
+                # (beam_idx // group_size) -> batch_idx
+                # (beam_idx % group_size) -> offset of idx inside the group
+                reordering_indices[batch_group_indices] = (
+                    num_beams * torch.div(beam_idx, group_size, rounding_mode="floor")
+                    + group_start_idx
+                    + (beam_idx % group_size)
+                )
+
+            # Store scores, attentions and hidden_states when required
+            if return_dict_in_generate:
+                if output_scores:
+                    scores += (processed_score,)
+                if output_logits:
+                    raw_logits += (raw_logit_score,)
+                if output_attentions:
+                    decoder_attentions += (
+                        (outputs.decoder_attentions,) if self.config.is_encoder_decoder else (outputs.attentions,)
+                    )
+                    if self.config.is_encoder_decoder:
+                        cross_attentions += (outputs.cross_attentions,)
+
+                if output_hidden_states:
+                    decoder_hidden_states += (
+                        (outputs.decoder_hidden_states,)
+                        if self.config.is_encoder_decoder
+                        else (outputs.hidden_states,)
+                    )
+
+            input_ids = torch.cat([input_ids, current_tokens.unsqueeze(-1)], dim=-1)
+
+            # This is needed to properly delete outputs.logits which may be very large for first iteration
+            # Otherwise a reference to outputs is kept which keeps the logits alive in the next iteration
+            # IMPORTANT: Note that this should appear BEFORE the call to _reorder_cache() to save the maximum memory
+            # (that way the memory peak does not include outputs.logits)
+            del outputs
+
+            if model_kwargs.get("past_key_values", None) is not None:
+                model_kwargs["past_key_values"] = self._temporary_reorder_cache(
+                    model_kwargs["past_key_values"], reordering_indices
+                )
+
+            # increase cur_len
+            cur_len = cur_len + 1
+
+            if beam_scorer.is_done or all(stopping_criteria(input_ids, scores)):
+                this_peer_finished = True
+
+        final_beam_indices = sum(beam_indices, ()) if beam_indices is not None else None
+        sequence_outputs = beam_scorer.finalize(
+            input_ids,
+            beam_scores,
+            next_tokens,
+            next_indices,
+            pad_token_id=pad_token_id,
+            eos_token_id=eos_token_id,
+            max_length=stopping_criteria.max_length,
+            beam_indices=final_beam_indices,
+            decoder_prompt_len=decoder_prompt_len,
+        )
+
+        if return_dict_in_generate:
+            if not output_scores:
+                sequence_outputs["sequence_scores"] = None
+
+            if self.config.is_encoder_decoder:
+                return GenerateBeamEncoderDecoderOutput(
+                    sequences=sequence_outputs["sequences"],
+                    sequences_scores=sequence_outputs["sequence_scores"],
+                    scores=scores,
+                    logits=raw_logits,
+                    beam_indices=sequence_outputs["beam_indices"],
+                    encoder_attentions=encoder_attentions,
+                    encoder_hidden_states=encoder_hidden_states,
+                    decoder_attentions=decoder_attentions,
+                    cross_attentions=cross_attentions,
+                    decoder_hidden_states=decoder_hidden_states,
+                    past_key_values=model_kwargs.get("past_key_values"),
+                )
+            else:
+                return GenerateBeamDecoderOnlyOutput(
+                    sequences=sequence_outputs["sequences"],
+                    sequences_scores=sequence_outputs["sequence_scores"],
+                    scores=scores,
+                    logits=raw_logits,
+                    beam_indices=sequence_outputs["beam_indices"],
+                    attentions=decoder_attentions,
+                    hidden_states=decoder_hidden_states,
+                    past_key_values=model_kwargs.get("past_key_values"),
+                )
+        else:
+            return sequence_outputs["sequences"]
+
+    def _constrained_beam_search(
+        self,
+        input_ids: torch.LongTensor,
+        constrained_beam_scorer: ConstrainedBeamSearchScorer,
+        logits_processor: LogitsProcessorList,
+        stopping_criteria: StoppingCriteriaList,
+        generation_config: GenerationConfig,
+        synced_gpus: bool,
+        **model_kwargs,
+    ) -> Union[GenerateBeamOutput, torch.LongTensor]:
+        r"""
+        Generates sequences of token ids for models with a language modeling head using **constrained beam search
+        decoding** and can be used for text-decoder, text-to-text, speech-to-text, and vision-to-text models.
+
+        Parameters:
+            input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
+                The sequence used as a prompt for the generation.
+            constrained_beam_scorer (`ConstrainedBeamSearchScorer`):
+                A derived instance of [`BeamScorer`] that defines how beam hypotheses are constructed, stored and
+                sorted during generation, while satisfying a list of positive constraints. For more information, the
+                documentation of [`ConstrainedBeamSearchScorer`] should be read.
+            logits_processor (`LogitsProcessorList`):
+                An instance of [`LogitsProcessorList`]. List of instances of class derived from [`LogitsProcessor`]
+                used to modify the prediction scores of the language modeling head applied at each generation step.
+            stopping_criteria (`StoppingCriteriaList`):
+                An instance of [`StoppingCriteriaList`]. List of instances of class derived from [`StoppingCriteria`]
+                used to tell if the generation loop should stop.
+            generation_config ([`~generation.GenerationConfig`]):
+                The generation configuration to be used as parametrization of the decoding method.
+            synced_gpus (`bool`):
+                Whether to continue running the while loop until max_length (needed to avoid deadlocking with
+                `FullyShardedDataParallel` and DeepSpeed ZeRO Stage 3).
+            model_kwargs:
+                Additional model specific kwargs will be forwarded to the `forward` function of the model. If model is
+                an encoder-decoder model the kwargs should include `encoder_outputs`.
+
+        Return:
+            [`~generation.GenerateBeamDecoderOnlyOutput`], [`~generation.GenerateBeamEncoderDecoderOutput`] or
+            `torch.LongTensor`: A `torch.LongTensor` containing the generated tokens (default behaviour) or a
+            [`~generation.GenerateBeamDecoderOnlyOutput`] if `model.config.is_encoder_decoder=False` and
+            `return_dict_in_generate=True` or a [`~generation.GenerateBeamEncoderDecoderOutput`] if
+            `model.config.is_encoder_decoder=True`.
+        """
+        # init values
+        pad_token_id = generation_config._pad_token_tensor
+        eos_token_id = generation_config._eos_token_tensor
+        output_attentions = generation_config.output_attentions
+        output_hidden_states = generation_config.output_hidden_states
+        output_scores = generation_config.output_scores
+        output_logits = generation_config.output_logits
+        return_dict_in_generate = generation_config.return_dict_in_generate
+
+        batch_size = len(constrained_beam_scorer._beam_hyps)
+        num_beams = constrained_beam_scorer.num_beams
+
+        batch_beam_size, cur_len = input_ids.shape
+        model_kwargs = self._get_initial_cache_position(input_ids, model_kwargs)
+
+        if num_beams * batch_size != batch_beam_size:
+            raise ValueError(
+                f"Batch dimension of `input_ids` should be {num_beams * batch_size}, but is {batch_beam_size}."
+            )
+
+        # init attention / hidden states / scores tuples
+        scores = () if (return_dict_in_generate and output_scores) else None
+        raw_logits = () if (return_dict_in_generate and output_logits) else None
+        beam_indices = (
+            tuple(() for _ in range(batch_beam_size)) if (return_dict_in_generate and output_scores) else None
+        )
+        decoder_attentions = () if (return_dict_in_generate and output_attentions) else None
+        cross_attentions = () if (return_dict_in_generate and output_attentions) else None
+        decoder_hidden_states = () if (return_dict_in_generate and output_hidden_states) else None
+
+        # if model is an encoder-decoder, retrieve encoder attention weights and hidden states
+        if return_dict_in_generate and self.config.is_encoder_decoder:
+            encoder_attentions = model_kwargs["encoder_outputs"].get("attentions") if output_attentions else None
+            encoder_hidden_states = (
+                model_kwargs["encoder_outputs"].get("hidden_states") if output_hidden_states else None
+            )
+
+        # initialise score of first beam with 0 and the rest with -1e9. This makes sure that only tokens
+        # of the first beam are considered to avoid sampling the exact same tokens across all beams.
+        beam_scores = torch.zeros((batch_size, num_beams), dtype=torch.float, device=input_ids.device)
+        beam_scores[:, 1:] = -1e9
+        beam_scores = beam_scores.view((batch_size * num_beams,))
+
+        this_peer_finished = False
+
+        decoder_prompt_len = input_ids.shape[-1]  # record the prompt length of decoder
+        while self._has_unfinished_sequences(this_peer_finished, synced_gpus, device=input_ids.device):
+            model_inputs = self.prepare_inputs_for_generation(input_ids, **model_kwargs)
+
+            # prepare variable output controls (note: some models won't accept all output controls)
+            model_inputs.update({"output_attentions": output_attentions} if output_attentions else {})
+            model_inputs.update({"output_hidden_states": output_hidden_states} if output_hidden_states else {})
+
+            outputs = self(**model_inputs, return_dict=True)
+
+            # synced_gpus: don't waste resources running the code we don't need; kwargs must be updated before skipping
+            model_kwargs = self._update_model_kwargs_for_generation(
+                outputs,
+                model_kwargs,
+                is_encoder_decoder=self.config.is_encoder_decoder,
+            )
+            if synced_gpus and this_peer_finished:
+                cur_len = cur_len + 1
+                continue
+
+            # Clone is needed to avoid keeping a hanging ref to outputs.logits which may be very large for first iteration
+            # (the clone itself is always small)
+            # .float() is needed to retain precision for later logits manipulations
+            next_token_logits = outputs.logits[:, -1, :].clone().float()
+            next_token_logits = next_token_logits.to(input_ids.device)
+            next_token_scores = nn.functional.log_softmax(
+                next_token_logits, dim=-1
+            )  # (batch_size * num_beams, vocab_size)
+
+            next_token_scores_processed = logits_processor(input_ids, next_token_scores)
+
+            next_token_scores = next_token_scores_processed + beam_scores[:, None].expand_as(
+                next_token_scores_processed
+            )
+
+            scores_for_all_vocab = next_token_scores.clone()
+
+            # Store scores, attentions and hidden_states when required
+            if return_dict_in_generate:
+                if output_scores:
+                    scores += (next_token_scores,)
+                if output_logits:
+                    raw_logits += (next_token_logits,)
+                if output_attentions:
+                    decoder_attentions += (
+                        (outputs.decoder_attentions,) if self.config.is_encoder_decoder else (outputs.attentions,)
+                    )
+                    if self.config.is_encoder_decoder:
+                        cross_attentions += (outputs.cross_attentions,)
+
+                if output_hidden_states:
+                    decoder_hidden_states += (
+                        (outputs.decoder_hidden_states,)
+                        if self.config.is_encoder_decoder
+                        else (outputs.hidden_states,)
+                    )
+
+            # reshape for beam search
+            vocab_size = next_token_scores.shape[-1]
+            next_token_scores = next_token_scores.view(batch_size, num_beams * vocab_size)
+
+            # Sample 1 + len(eos_token_id) next tokens for each beam so we have at least 1 non eos token per beam.
+            n_eos_tokens = eos_token_id.shape[0] if eos_token_id is not None else 0
+            next_token_scores, next_tokens = torch.topk(
+                next_token_scores, max(2, 1 + n_eos_tokens) * num_beams, dim=1, largest=True, sorted=True
+            )
+
+            next_indices = (next_tokens / vocab_size).long()
+            next_tokens = next_tokens % vocab_size
+
+            # stateless
+            beam_outputs = constrained_beam_scorer.process(
+                input_ids,
+                next_token_scores,
+                next_tokens,
+                next_indices,
+                scores_for_all_vocab,
+                pad_token_id=pad_token_id,
+                eos_token_id=eos_token_id,
+                beam_indices=beam_indices,
+                decoder_prompt_len=decoder_prompt_len,
+            )
+            beam_scores = beam_outputs["next_beam_scores"]
+            beam_next_tokens = beam_outputs["next_beam_tokens"]
+            beam_idx = beam_outputs["next_beam_indices"]
+
+            input_ids = torch.cat([input_ids[beam_idx, :], beam_next_tokens.unsqueeze(-1)], dim=-1)
+
+            # This is needed to properly delete outputs.logits which may be very large for first iteration
+            # Otherwise a reference to outputs is kept which keeps the logits alive in the next iteration
+            # IMPORTANT: Note that this should appear BEFORE the call to _reorder_cache() to save the maximum memory
+            # (that way the memory peak does not include outputs.logits)
+            del outputs
+
+            if model_kwargs.get("past_key_values", None) is not None:
+                model_kwargs["past_key_values"] = self._temporary_reorder_cache(
+                    model_kwargs["past_key_values"], beam_idx
+                )
+
+            if return_dict_in_generate and output_scores:
+                beam_indices = tuple((beam_indices[beam_idx[i]] + (beam_idx[i],) for i in range(len(beam_indices))))
+
+            # increase cur_len
+            cur_len = cur_len + 1
+
+            if constrained_beam_scorer.is_done or all(stopping_criteria(input_ids, scores)):
+                this_peer_finished = True
+
+        sequence_outputs = constrained_beam_scorer.finalize(
+            input_ids,
+            beam_scores,
+            next_tokens,
+            next_indices,
+            pad_token_id=pad_token_id,
+            eos_token_id=eos_token_id,
+            max_length=stopping_criteria.max_length,
+            beam_indices=beam_indices,
+            decoder_prompt_len=decoder_prompt_len,
+        )
+
+        if return_dict_in_generate:
+            if not output_scores:
+                sequence_outputs["sequence_scores"] = None
+            if self.config.is_encoder_decoder:
+                return GenerateBeamEncoderDecoderOutput(
+                    sequences=sequence_outputs["sequences"],
+                    sequences_scores=sequence_outputs["sequence_scores"],
+                    scores=scores,
+                    logits=raw_logits,
+                    beam_indices=sequence_outputs["beam_indices"],
+                    encoder_attentions=encoder_attentions,
+                    encoder_hidden_states=encoder_hidden_states,
+                    decoder_attentions=decoder_attentions,
+                    cross_attentions=cross_attentions,
+                    decoder_hidden_states=decoder_hidden_states,
+                    past_key_values=model_kwargs.get("past_key_values"),
+                )
+            else:
+                return GenerateBeamDecoderOnlyOutput(
+                    sequences=sequence_outputs["sequences"],
+                    sequences_scores=sequence_outputs["sequence_scores"],
+                    scores=scores,
+                    logits=raw_logits,
+                    beam_indices=sequence_outputs["beam_indices"],
+                    attentions=decoder_attentions,
+                    hidden_states=decoder_hidden_states,
+                    past_key_values=model_kwargs.get("past_key_values"),
+                )
+        else:
+            return sequence_outputs["sequences"]
+
+    def _assisted_decoding(
+        self,
+        input_ids: torch.LongTensor,
+        candidate_generator: CandidateGenerator,
+        logits_processor: LogitsProcessorList,
+        stopping_criteria: StoppingCriteriaList,
+        generation_config: GenerationConfig,
+        synced_gpus: bool,
+        streamer: Optional["BaseStreamer"],
+        **model_kwargs,
+    ) -> Union[GenerateNonBeamOutput, torch.LongTensor]:
+        r"""
+        Generates sequences of token ids for models with a language modeling head using **greedy decoding** or
+        **sample** (depending on `do_sample`), assisted by candidate sequences. Assisted generation is an example of a
+        candidate decoding strategy. Can be used for text-decoder, text-to-text, speech-to-text, and vision-to-text
+        models.
+
+        Parameters:
+            input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
+                The sequence used as a prompt for the generation.
+            candidate_generator (`CandidateGenerator`):
+                A derived instance of [`CandidateGenerator`] that defines how candidate sequences are generated. For
+                more information, the documentation of [`CandidateGenerator`] should be read.
+            logits_processor (`LogitsProcessorList`):
+                An instance of [`LogitsProcessorList`]. List of instances of class derived from [`LogitsProcessor`]
+                used to modify the prediction scores of the language modeling head applied at each generation step.
+            stopping_criteria (`StoppingCriteriaList`):
+                An instance of [`StoppingCriteriaList`]. List of instances of class derived from [`StoppingCriteria`]
+                used to tell if the generation loop should stop.
+            generation_config ([`~generation.GenerationConfig`]):
+                The generation configuration to be used as parametrization of the decoding method.
+            synced_gpus (`bool`):
+                Whether to continue running the while loop until max_length (needed to avoid deadlocking with
+                `FullyShardedDataParallel` and DeepSpeed ZeRO Stage 3).
+            streamer (`BaseStreamer`, *optional*):
+                Streamer object that will be used to stream the generated sequences. Generated tokens are passed
+                through `streamer.put(token_ids)` and the streamer is responsible for any further processing.
+            model_kwargs:
+                Additional model specific keyword arguments will be forwarded to the `forward` function of the model.
+                If model is an encoder-decoder model the kwargs should include `encoder_outputs`.
+
+        Return:
+            [`~generation.GenerateDecoderOnlyOutput`], [`~generation.GenerateEncoderDecoderOutput`] or
+            `torch.LongTensor`: A `torch.LongTensor` containing the generated tokens (default behaviour) or a
+            [`~generation.GenerateDecoderOnlyOutput`] if `model.config.is_encoder_decoder=False` and
+            `return_dict_in_generate=True` or a [`~generation.GenerateEncoderDecoderOutput`] if
+            `model.config.is_encoder_decoder=True`.
+        """
+        # init values
+        do_sample = generation_config.do_sample
+        output_attentions = generation_config.output_attentions
+        output_hidden_states = generation_config.output_hidden_states
+        output_scores = generation_config.output_scores
+        output_logits = generation_config.output_logits
+        return_dict_in_generate = generation_config.return_dict_in_generate
+
+        # init attention / hidden states / scores tuples
+        scores = () if (return_dict_in_generate and output_scores) else None
+        raw_logits = () if (return_dict_in_generate and output_logits) else None
+        decoder_attentions = () if (return_dict_in_generate and output_attentions) else None
+        cross_attentions = () if (return_dict_in_generate and output_attentions) else None
+        decoder_hidden_states = () if (return_dict_in_generate and output_hidden_states) else None
+
+        # if model is an encoder-decoder, retrieve encoder attention weights and hidden states
+        if return_dict_in_generate and self.config.is_encoder_decoder:
+            encoder_attentions = model_kwargs["encoder_outputs"].get("attentions") if output_attentions else None
+            encoder_hidden_states = (
+                model_kwargs["encoder_outputs"].get("hidden_states") if output_hidden_states else None
+            )
+
+        # keep track of which sequences are already finished
+        batch_size = input_ids.shape[0]
+        unfinished_sequences = torch.ones(batch_size, dtype=torch.long, device=input_ids.device)
+        model_kwargs = self._get_initial_cache_position(input_ids, model_kwargs)
+
+        this_peer_finished = False
+        is_first_iteration = True  # to preserve the same API in the output as other generation methods
+        while self._has_unfinished_sequences(this_peer_finished, synced_gpus, device=input_ids.device):
+            cur_len = input_ids.shape[-1]
+
+            #  1. Fetch candidate sequences from a `CandidateGenerator` and move to the correct device
+            candidate_input_ids, candidate_logits = candidate_generator.get_candidates(input_ids)
+
+            candidate_input_ids = candidate_input_ids.to(self.device)
+            if candidate_logits is not None:
+                candidate_logits = candidate_logits.to(self.device)
+
+            candidate_length = candidate_input_ids.shape[1] - input_ids.shape[1]
+            is_done_candidate = stopping_criteria(candidate_input_ids, None)
+
+            # 2. Use the original model to obtain the next token logits given the candidate sequence. We obtain
+            # `candidate_length + 1` relevant logits from this process: in the event that all candidates are correct,
+            # we use this forward pass to also pick the subsequent logits in the original model.
+
+            # 2.1. Prepare the model inputs
+            candidate_kwargs = copy.copy(model_kwargs)
+            candidate_kwargs = _prepare_attention_mask(
+                candidate_kwargs, candidate_input_ids.shape[1], self.config.is_encoder_decoder
+            )
+            candidate_kwargs = _prepare_token_type_ids(candidate_kwargs, candidate_input_ids.shape[1])
+            if "cache_position" in candidate_kwargs:
+                candidate_kwargs["cache_position"] = torch.cat(
+                    (
+                        candidate_kwargs["cache_position"],
+                        torch.arange(cur_len, cur_len + candidate_length, device=input_ids.device, dtype=torch.long),
+                    ),
+                    dim=0,
+                )
+
+            model_inputs = self.prepare_inputs_for_generation(candidate_input_ids, **candidate_kwargs)
+            if "num_logits_to_keep" in model_inputs:
+                model_inputs["num_logits_to_keep"] = candidate_length + 1
+
+            # 2.2. Run a forward pass on the candidate sequence
+            # prepare variable output controls (note: some models won't accept all output controls)
+            model_inputs.update({"output_attentions": output_attentions} if output_attentions else {})
+            model_inputs.update({"output_hidden_states": output_hidden_states} if output_hidden_states else {})
+
+            outputs = self(**model_inputs)
+
+            # 2.3. Process the new logits
+            # .float() is needed to retain precision for later logits manipulations
+            new_logits = outputs.logits[:, -candidate_length - 1 :].float()  # excludes the input prompt if present
+            new_logits = new_logits.to(input_ids.device)
+            next_token_logits = new_logits.clone()
+            if len(logits_processor) > 0:
+                for i in range(candidate_length + 1):
+                    new_logits[:, i, :] = logits_processor(candidate_input_ids[:, : cur_len + i], new_logits[:, i, :])
+
+            # 3. Select the accepted tokens. There are two possible cases:
+            # Case 1: `do_sample=True` and we have logits for the candidates (originally from speculative decoding)
+            # 👉 Apply algorithm 1 from the speculative decoding paper (https://arxiv.org/pdf/2211.17192.pdf).
+            if do_sample and candidate_logits is not None:
+                valid_tokens, n_matches = _speculative_sampling(
+                    candidate_input_ids,
+                    candidate_logits,
+                    candidate_length,
+                    new_logits,
+                    is_done_candidate,
+                )
+
+            # Case 2: all other cases (originally from assisted generation) 👉 Compare the tokens selected from the
+            # original model logits with the candidate tokens. We can keep the candidate tokens until the first
+            # mismatch, or until the max length is reached.
+            else:
+                if do_sample:
+                    probs = new_logits.softmax(dim=-1)
+                    selected_tokens = torch.multinomial(probs[0, :, :], num_samples=1).squeeze(1)[None, :]
+                else:
+                    selected_tokens = new_logits.argmax(dim=-1)
+
+                candidate_new_tokens = candidate_input_ids[:, cur_len:]
+                n_matches = ((~(candidate_new_tokens == selected_tokens[:, :-1])).cumsum(dim=-1) < 1).sum()
+
+                # Ensure we don't generate beyond max_len or an EOS token
+                if is_done_candidate and n_matches == candidate_length:
+                    n_matches -= 1
+                valid_tokens = selected_tokens[:, : n_matches + 1]
+
+            # 4. Update variables according to the number of matching assistant tokens. Remember: the token generated
+            # by the model after the last candidate match is also valid, as it is generated from a correct sequence.
+            # Because of this last token, assisted generation search reduces to a normal greedy search/sample if there
+            # is no match.
+
+            # 4.1. Get the valid continuation, after the matching tokens
+            input_ids = torch.cat((input_ids, valid_tokens), dim=-1)
+            if streamer is not None:
+                streamer.put(valid_tokens.cpu())
+            new_cur_len = input_ids.shape[-1]
+
+            # 4.2. Discard past key values relative to unused assistant tokens
+            new_cache_size = new_cur_len - 1
+            outputs.past_key_values = _crop_past_key_values(self, outputs.past_key_values, new_cache_size)
+
+            # 5. Update the candidate generation strategy if needed
+            candidate_generator.update_candidate_strategy(input_ids, new_logits, n_matches)
+
+            # synced_gpus: don't waste resources running the code we don't need; kwargs must be updated before skipping
+            model_kwargs = self._update_model_kwargs_for_generation(
+                outputs,
+                model_kwargs,
+                is_encoder_decoder=self.config.is_encoder_decoder,
+                num_new_tokens=n_matches + 1,
+            )
+            if synced_gpus and this_peer_finished:
+                continue
+
+            # Store scores, attentions and hidden_states when required
+            # Assistant: modified to append one tuple element per token, as in the other generation methods.
+            if return_dict_in_generate:
+                newly_added_length = n_matches + 1
+                if output_scores:
+                    scores += tuple(new_logits[:, i, :] for i in range(newly_added_length))
+                if output_logits:
+                    raw_logits += tuple(next_token_logits[:, i, :] for i in range(newly_added_length))
+
+                newly_added_length = new_cur_len if is_first_iteration else newly_added_length
+                if output_attentions:
+                    if self.config.is_encoder_decoder:
+                        cross_attentions = _split_model_outputs(
+                            cross_attentions, outputs.cross_attentions, cur_len, newly_added_length
+                        )
+                        decoder_attentions = _split_model_outputs(
+                            decoder_attentions,
+                            outputs.decoder_attentions,
+                            cur_len,
+                            newly_added_length,
+                            is_decoder_attention=True,
+                        )
+                    # some (V)LLMs have hard requirement on SDPA and thus never return attn
+                    elif outputs.attentions[0] is not None:
+                        decoder_attentions = _split_model_outputs(
+                            decoder_attentions,
+                            outputs.attentions,
+                            cur_len,
+                            newly_added_length,
+                            is_decoder_attention=True,
+                        )
+                if output_hidden_states:
+                    if self.config.is_encoder_decoder:
+                        decoder_hidden_states = _split_model_outputs(
+                            decoder_hidden_states, outputs.decoder_hidden_states, cur_len, newly_added_length
+                        )
+                    else:
+                        decoder_hidden_states = _split_model_outputs(
+                            decoder_hidden_states, outputs.hidden_states, cur_len, newly_added_length
+                        )
+
+            unfinished_sequences = unfinished_sequences & ~stopping_criteria(input_ids, scores)
+            this_peer_finished = unfinished_sequences.max() == 0
+            is_first_iteration = False
+
+        if streamer is not None:
+            streamer.end()
+
+        if (
+            hasattr(candidate_generator, "assistant_model")
+            and candidate_generator.assistant_model.generation_config.num_assistant_tokens_schedule == "heuristic"
+        ):
+            candidate_generator.assistant_model.generation_config.num_assistant_tokens = (
+                candidate_generator.num_assistant_tokens
+            )
+        if return_dict_in_generate:
+            if self.config.is_encoder_decoder:
+                return GenerateEncoderDecoderOutput(
+                    sequences=input_ids,
+                    scores=scores,
+                    logits=raw_logits,
+                    encoder_attentions=encoder_attentions,
+                    encoder_hidden_states=encoder_hidden_states,
+                    decoder_attentions=decoder_attentions,
+                    cross_attentions=cross_attentions,
+                    decoder_hidden_states=decoder_hidden_states,
+                    past_key_values=model_kwargs.get("past_key_values"),
+                )
+            else:
+                return GenerateDecoderOnlyOutput(
+                    sequences=input_ids,
+                    scores=scores,
+                    logits=raw_logits,
+                    attentions=decoder_attentions,
+                    hidden_states=decoder_hidden_states,
+                    past_key_values=model_kwargs.get("past_key_values"),
+                )
+        else:
+            return input_ids
+
+
+def _speculative_sampling(
+    candidate_input_ids,
+    candidate_logits,
+    candidate_length,
+    new_logits,
+    is_done_candidate,
+):
+    """
+    Applies sampling as in the speculative decoding paper (https://arxiv.org/pdf/2211.17192.pdf, algorithm 1). Returns
+    the selected tokens, as well as the number of candidate matches.
+
+    NOTE: Unless otherwise stated, the variable names match those in the paper.
+    """
+    new_candidate_input_ids = candidate_input_ids[:, -candidate_length:]
+    # Gets the probabilities from the logits. q_i and p_i denote the assistant and model probabilities of the tokens
+    # selected by the assistant, respectively.
+    q = candidate_logits.softmax(dim=-1)
+    q_i = q[:, torch.arange(candidate_length), new_candidate_input_ids].squeeze(0, 1)
+    p = new_logits.softmax(dim=-1)
+    p_i = p[:, torch.arange(candidate_length), new_candidate_input_ids].squeeze(0, 1)
+    probability_ratio = p_i / q_i
+
+    # When probability_ratio > 1 (i.e. q_i(x) < p_i(x), or "assistant probability of the candidate token is smaller
+    # than the model probability for the same token"), keep the token. Otherwise reject with p = 1 - probability_ratio
+    # (= keep with p = probability_ratio). Keep all the tokens until the first rejection
+    r_i = torch.rand_like(probability_ratio)
+    is_accepted = r_i <= probability_ratio
+    n_matches = ((~is_accepted).cumsum(dim=-1) < 1).sum()  # this is `n` in algorithm 1
+
+    # Ensure we don't generate beyond max_len or an EOS token (not in algorithm 1, but needed for correct behavior)
+    if is_done_candidate and n_matches == candidate_length:
+        # Output length is assumed to be `n_matches + 1`. Since we won't generate another token with the target model
+        # due to acceptance on EOS we fix `n_matches`
+        n_matches -= 1
+        valid_tokens = new_candidate_input_ids[:, : n_matches + 1]
+    else:
+        # Next token selection: if there is a rejection, adjust the distribution from the main model before sampling.
+        gamma = candidate_logits.shape[1]
+        p_n_plus_1 = p[:, n_matches, :]
+        if n_matches < gamma:
+            q_n_plus_1 = q[:, n_matches, :]
+            p_prime = torch.clamp((p_n_plus_1 - q_n_plus_1), min=0)
+            p_prime.div_(p_prime.sum())
+        else:
+            p_prime = p_n_plus_1
+        t = torch.multinomial(p_prime, num_samples=1).squeeze(1)[None, :]
+
+        # The selected tokens include the matches (if any) plus the next sampled tokens
+        if n_matches > 0:
+            valid_tokens = torch.cat((new_candidate_input_ids[:, :n_matches], t), dim=-1)
+        else:
+            valid_tokens = t
+
+    return valid_tokens, n_matches
+
+
+def _split_model_outputs(outputs, new_outputs, cur_len, added_len, is_decoder_attention=False):
+    """
+    Given the (decoder/cross attentions)/(decoder hidden states) for multiple generated tokens, splits it into a tuple
+    where each member corresponds to a single generated token.
+    """
+    # Retrocompatibility: in our generation functions, the first iteration includes the attention/hidden states for the
+    # prompt.
+    if len(outputs) == 0:
+        new_tuple = ()
+        for layer in new_outputs:
+            last_dim_size = cur_len if is_decoder_attention else layer.shape[-1]
+            new_tuple += (layer[..., :cur_len, :last_dim_size],)
+        outputs += (new_tuple,)
+        # The first iteration contains the prompt + 1 generated token, let's update the length variables accordingly
+        cur_len += 1
+        added_len -= cur_len
+
+    for i in range(added_len):
+        new_tuple = ()
+        for layer in new_outputs:
+            last_dim_size = cur_len + i if is_decoder_attention else layer.shape[-1]
+            new_tuple += (layer[..., i : i + 1, :last_dim_size],)
+        outputs += (new_tuple,)
+    return outputs
+
+
+def _ranking_fast(
+    context_hidden: torch.FloatTensor,
+    next_hidden: torch.FloatTensor,
+    next_top_k_probs: torch.FloatTensor,
+    cosine_matrix_mask: torch.LongTensor,
+    alpha: float,
+    beam_width: int,
+) -> torch.FloatTensor:
+    """
+    Reranks the top_k candidates based on a degeneration penalty (cosine similarity with previous tokens), as described
+    in the paper "A Contrastive Framework for Neural Text Generation". Returns the index of the best candidate for each
+    row in the batch.
+    """
+    norm_context_hidden = context_hidden / context_hidden.norm(dim=2, keepdim=True)
+    norm_next_hidden = next_hidden / next_hidden.norm(dim=2, keepdim=True)
+    cosine_matrix = torch.matmul(norm_context_hidden, norm_next_hidden.transpose(1, 2)).squeeze(-1)  # [B*K, S]
+
+    # Penalize cosine_matrix based on the cosine_matrix_mask (ignore padding positions)
+    # Using a large negative value for masked positions
+    cosine_matrix_mask = cosine_matrix_mask.to(dtype=cosine_matrix.dtype)
+    cosine_matrix_mask = (1 - cosine_matrix_mask) * torch.finfo(cosine_matrix.dtype).min
+    cosine_matrix = cosine_matrix + cosine_matrix_mask
+
+    degeneration_penalty, _ = torch.max(cosine_matrix, dim=-1)  # [B*K]
+    next_top_k_probs = next_top_k_probs.view(-1)  # [B*K]
+    contrastive_score = (1.0 - alpha) * next_top_k_probs - alpha * degeneration_penalty
+    contrastive_score = torch.stack(torch.split(contrastive_score, beam_width))  # [B, K]
+    _, selected_idx = contrastive_score.max(dim=-1)  # [B]
+    return selected_idx
+
+
+def _split(data, full_batch_size: int, num_hidden_layers: int, split_size: int = None):
+    """
+    Takes care of three cases:
+    1. data is a tensor: e.g. last_hidden_state, pooler_output etc. split them on the batch_size dim
+    2. data is a tuple: e.g. hidden_states, attentions etc. Keep the tuple as it is and split each tensor in it and
+       return a list of tuples
+    3. data is a tuple of tuples, e.g. past_key_values. Keep the tuple as it is and split each tuple in it and
+       return a list of tuples of tuples
+    (see documentation of ModelOutput)
+    """
+    if data is None:
+        return [None] * (full_batch_size // split_size)
+    if isinstance(data, torch.Tensor):
+        return [data[i : i + split_size] for i in range(0, full_batch_size, split_size)]
+    # New cache format
+    elif isinstance(data, DynamicCache) or (
+        isinstance(data, EncoderDecoderCache) and isinstance(data.self_attention_cache, DynamicCache)
+    ):
+        return data.batch_split(full_batch_size, split_size, num_hidden_layers)
+    elif isinstance(data, tuple):
+        # If the elements of the tuple are also tuples (e.g., past_key_values in our earlier example)
+        if isinstance(data[0], tuple):
+            return [
+                tuple(tuple(tensor[i : i + split_size] for tensor in inner_tuple) for inner_tuple in data)
+                for i in range(0, full_batch_size, split_size)
+            ]
+
+        else:
+            return [
+                tuple(sub_tensor[i : i + split_size] for sub_tensor in data)
+                for i in range(0, full_batch_size, split_size)
+            ]
+    else:
+        raise TypeError(f"Unexpected attribute type: {type(data)}")
+
+
+def _split_model_inputs(
+    model_input: Union[ModelOutput, Dict], split_size: int, full_batch_size: int, config: PretrainedConfig
+) -> List[Union[ModelOutput, Dict]]:
+    """
+    Split a ModelOutput object (or its subclasses) or Dict into a list of same-class objects based on a specified split
+    size. The input object is dict when it was prepared for forward pass and ModelOutput when it was returned from
+    previous forward pass.
+    """
+    # Edge case: if model_input is None, return a list of Nones
+    # this happens with Whisper where encoder_outputs is None
+    if model_input is None:
+        return [model_input] * (full_batch_size // split_size)
+    # Infer the class from the object
+    model_output_cls = type(model_input)
+    if (full_batch_size % split_size) != 0:
+        raise ValueError("`full_batch_size` must be divisible by `split_size`")
+
+    if split_size > full_batch_size:
+        raise ValueError("`split_size` must be smaller or equal to `full_batch_size`")
+
+    # Helper function to split tensors or tuples of tensors
+
+    # Find all the dataclass fields (e.g., last_hidden_state, pooler_output etc.) and split them
+    keys = (
+        model_input.__dataclass_fields__.keys() if hasattr(model_input, "__dataclass_fields__") else model_input.keys()
+    )
+    # We only keep keys that are in the model_input
+    keys = [k for k in keys if k in model_input]
+    # Here we can have four types of values: tensors, tuples of tensors and booleans, and encoder_outputs which is a
+    # ModelOutput object.
+    # bool should not be split but replicated for each split
+    bool_keys = [k for k in keys if isinstance(model_input[k], bool) or k == "cache_position"]
+    keys_to_ignore = ["cache_position", "encoder_outputs", "num_logits_to_keep"]
+    non_bool_keys = [k for k in keys if not isinstance(model_input[k], bool) and k not in keys_to_ignore]
+
+    num_hidden_layers = config.get_text_config().num_hidden_layers
+
+    # we split the tensors and tuples of tensors
+    data_split_list = [
+        {k: _split(model_input[k], full_batch_size, num_hidden_layers, split_size)[i] for k in non_bool_keys}
+        for i in range(full_batch_size // split_size)
+    ]
+    # bool values are the same and replicated for each split
+    bool_data = {k: model_input[k] for k in bool_keys}
+    # encoder_outputs is a ModelOutput object and should be split by its own
+    if "encoder_outputs" in model_input:
+        encoder_outputs_split = _split_model_inputs(
+            model_input["encoder_outputs"], split_size, full_batch_size, config.get_text_config()
+        )
+        data_split_list = [
+            {**data_split, "encoder_outputs": encoder_outputs_split[i]} for i, data_split in enumerate(data_split_list)
+        ]
+    # num_logits_to_keep should be replicated for each split, similar to bool values
+    if "num_logits_to_keep" in model_input:
+        data_split_list = [
+            {**data_split, "num_logits_to_keep": model_input["num_logits_to_keep"]} for data_split in data_split_list
+        ]
+
+    # Convert each dictionary in the list to an object of the inferred class
+    split_model_inputs: List[Union[ModelOutput, Dict]] = [
+        model_output_cls(**data_split, **bool_data) for data_split in data_split_list
+    ]
+
+    return split_model_inputs
+
+
+def stack_model_outputs(model_outputs: List[ModelOutput], config: PretrainedConfig) -> ModelOutput:
+    """
+    Stack a list of ModelOutput objects (or its subclasses) along the batch_size dimension. The function infers the
+    specific ModelOutput subclass from the list provided.
+    """
+    if not model_outputs:
+        raise ValueError("Input list is empty.")
+
+    # Infer the class from the first object in the list
+    model_output_cls = type(model_outputs[0])
+    num_hidden_layers = config.get_text_config().num_hidden_layers
+
+    # Ensure all objects are of the same type
+    if not all(isinstance(obj, model_output_cls) for obj in model_outputs):
+        raise ValueError("All elements in the list should be of the same type.")
+
+    # Helper function to concat tensors or tuples of tensors
+    def _concat(data):
+        """
+        Reverse of `_split` function above.
+        """
+        if any(data is None for data in data):
+            return None
+        if isinstance(data[0], torch.Tensor):
+            return torch.cat(data, dim=0)
+        # New cache format
+        elif isinstance(data[0], DynamicCache):
+            return DynamicCache.from_batch_splits(data, num_hidden_layers=num_hidden_layers)
+        elif isinstance(data[0], EncoderDecoderCache):
+            return EncoderDecoderCache.from_batch_splits(data, num_hidden_layers=num_hidden_layers)
+        elif isinstance(data[0], tuple):
+            # If the elements of the tuple are also tuples (e.g., past_key_values in our earlier example)
+            if isinstance(data[0][0], tuple):
+                return tuple(
+                    tuple(torch.cat([attr[i][j] for attr in data], dim=0) for j in range(len(data[0][0])))
+                    for i in range(len(data[0]))
+                )
+            else:
+                return tuple(torch.cat([attr[i] for attr in data], dim=0) for i in range(len(data[0])))
+        elif isinstance(data[0], (int, float)):
+            # If the elements are integers or floats, return a tensor
+            return torch.tensor(data)
+        else:
+            raise TypeError(f"Unexpected attribute type: {type(data[0])}")
+
+    # Use a dictionary comprehension to gather attributes from all objects and concatenate them
+    concatenated_data = {
+        k: _concat([getattr(model_output, k) for model_output in model_outputs])
+        for k in model_output_cls.__dataclass_fields__.keys()
+    }
+
+    # Return a new object of the inferred class with the concatenated attributes
+    return model_output_cls(**concatenated_data)
+
+
+def _relative_top_filter(
+    scores: torch.FloatTensor,
+    baseline_scores: torch.FloatTensor,
+    relative_top: float = 0.1,
+    filter_value: float = -float("Inf"),
+    base_filter_value=-1e-3,
+    min_tokens_to_keep: int = 1,
+) -> torch.FloatTensor:
+    """
+    Reference: https://github.com/XiangLi1999/ContrastiveDecoding/blob/170e9142e92159c1237d731e240f5eb14aabf428/transformers/src/transformers/generation_logits_process.py#L235
+    Apply filtering to only keep tokens with a probability above a certain threshold. The threshold is defined as `relative_top` * max probability in the distribution.
+    """
+    scores_normalized = scores.log_softmax(dim=-1)
+    baseline_scores_normalized = baseline_scores.log_softmax(dim=-1)
+    sorted_logits, sorted_indices = torch.sort(scores_normalized, descending=True)
+    min_thresh = sorted_logits[..., min_tokens_to_keep - 1]
+    probs_max = torch.max(scores_normalized, dim=-1).values
+    probs_thresh = probs_max + np.log(relative_top)
+    probs_thresh = torch.min(min_thresh, probs_thresh)
+    probs_thresh = probs_thresh.unsqueeze(-1)
+    baseline_scores_normalized[scores_normalized < probs_thresh] = base_filter_value
+    scores_normalized[scores_normalized < probs_thresh] = filter_value
+    return scores_normalized, baseline_scores_normalized
+
+
+def _dola_select_contrast(
+    candidate_premature_layers: List[int],
+    candidate_premature_logits: Dict[int, torch.FloatTensor],
+    final_logits: torch.FloatTensor,
+) -> torch.FloatTensor:
+    if len(candidate_premature_layers) == 1:
+        base_logits = candidate_premature_logits[candidate_premature_layers[0]]
+        final_logits, base_logits = _relative_top_filter(final_logits, base_logits)
+        logits = final_logits - base_logits
+        return logits
+
+    # 1. Stacking all premature_layers into a new dimension
+    stacked_premature_layers = torch.stack([candidate_premature_logits[i] for i in candidate_premature_layers], dim=0)
+
+    # 2. Calculate the softmax values for mature_layer and all premature_layers
+    # shape: (batch_size, vocab_size)
+    softmax_mature_layer = F.softmax(final_logits, dim=-1)
+    # shape: (num_premature_layers, batch_size, vocab_size)
+    softmax_premature_layers = F.softmax(stacked_premature_layers, dim=-1)
+
+    # 3. Calculate the average distribution
+    # shape: (num_premature_layers, batch_size, vocab_size)
+    avg_dist = 0.5 * (softmax_mature_layer[None, :, :] + softmax_premature_layers)
+
+    # 4. Calculate log-softmax for the KL divergence
+    # shape: (batch_size, vocab_size)
+    log_softmax_mature_layer = F.log_softmax(final_logits, dim=-1)
+    # shape: (num_premature_layers, batch_size, vocab_size)
+    log_softmax_premature_layers = F.log_softmax(stacked_premature_layers, dim=-1)
+
+    # 5. Calculate the KL divergences and then the JS divergences
+    # shape: (num_premature_layers, batch_size)
+    kl1 = F.kl_div(log_softmax_mature_layer[None, :, :], avg_dist, reduction="none").mean(-1)
+    # shape: (num_premature_layers, batch_size)
+    kl2 = F.kl_div(log_softmax_premature_layers, avg_dist, reduction="none").mean(-1)
+    js_divs = 0.5 * (kl1 + kl2)  # shape: (num_premature_layers, batch_size)
+
+    # 6. Reduce the batchmean
+    js_divs = js_divs.mean(-1)  # shape: (num_premature_layers,)
+    premature_layer = candidate_premature_layers[int(js_divs.argmax().cpu().item())]
+
+    base_logits = candidate_premature_logits[premature_layer]
+    final_logits, base_logits = _relative_top_filter(final_logits, base_logits)
+    logits = final_logits - base_logits
+    return logits
diff --git a/.venv/lib/python3.11/site-packages/transformers/generation/watermarking.py b/.venv/lib/python3.11/site-packages/transformers/generation/watermarking.py
new file mode 100644
index 0000000000000000000000000000000000000000..da90c03dd0da89cdf98b2eede63f1eca2b3b9197
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/transformers/generation/watermarking.py
@@ -0,0 +1,549 @@
+# coding=utf-8
+# Copyright 2024 The HuggingFace Inc. team and Google DeepMind.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import collections
+from dataclasses import dataclass
+from functools import lru_cache
+from typing import Any, Dict, Optional, Tuple, Union
+
+import numpy as np
+import torch
+from torch import nn
+from torch.nn import BCELoss
+
+from ..modeling_utils import PreTrainedModel
+from ..utils import ModelOutput, is_torch_available, logging
+from .configuration_utils import PretrainedConfig, WatermarkingConfig
+
+
+if is_torch_available():
+    import torch
+
+    from .logits_process import SynthIDTextWatermarkLogitsProcessor, WatermarkLogitsProcessor
+
+
+logger = logging.get_logger(__name__)
+
+
+@dataclass
+class WatermarkDetectorOutput:
+    """
+    Outputs of a watermark detector.
+
+    Args:
+        num_tokens_scored (np.array of shape (batch_size)):
+            Array containing the number of tokens scored for each element in the batch.
+        num_green_tokens (np.array of shape (batch_size)):
+            Array containing the number of green tokens for each element in the batch.
+        green_fraction (np.array of shape (batch_size)):
+            Array containing the fraction of green tokens for each element in the batch.
+        z_score (np.array of shape (batch_size)):
+            Array containing the z-score for each element in the batch. Z-score here shows
+            how many standard deviations away is the green token count in the input text
+            from the expected green token count for machine-generated text.
+        p_value (np.array of shape (batch_size)):
+            Array containing the p-value for each batch obtained from z-scores.
+        prediction (np.array of shape (batch_size)), *optional*:
+            Array containing boolean predictions whether a text is machine-generated for each element in the batch.
+        confidence (np.array of shape (batch_size)), *optional*:
+            Array containing confidence scores of a text being machine-generated for each element in the batch.
+    """
+
+    num_tokens_scored: np.array = None
+    num_green_tokens: np.array = None
+    green_fraction: np.array = None
+    z_score: np.array = None
+    p_value: np.array = None
+    prediction: Optional[np.array] = None
+    confidence: Optional[np.array] = None
+
+
+class WatermarkDetector:
+    r"""
+    Detector for detection of watermark generated text. The detector needs to be given the exact same settings that were
+    given during text generation to replicate the watermark greenlist generation and so detect the watermark. This includes
+    the correct device that was used during text generation, the correct watermarking arguments and the correct tokenizer vocab size.
+    The code was based on the [original repo](https://github.com/jwkirchenbauer/lm-watermarking/tree/main).
+
+    See [the paper](https://arxiv.org/abs/2306.04634) for more information.
+
+    Args:
+        model_config (`PretrainedConfig`):
+            The model config that will be used to get model specific arguments used when generating.
+        device (`str`):
+            The device which was used during watermarked text generation.
+        watermarking_config (Union[`WatermarkingConfig`, `Dict`]):
+            The exact same watermarking config and arguments used when generating text.
+        ignore_repeated_ngrams (`bool`, *optional*, defaults to `False`):
+            Whether to count every unique ngram only once or not.
+        max_cache_size (`int`, *optional*, defaults to 128):
+            The max size to be used for LRU caching of seeding/sampling algorithms called for every token.
+
+    Examples:
+
+    ```python
+    >>> from transformers import AutoTokenizer, AutoModelForCausalLM, WatermarkDetector, WatermarkingConfig
+
+    >>> model_id = "openai-community/gpt2"
+    >>> model = AutoModelForCausalLM.from_pretrained(model_id)
+    >>> tok = AutoTokenizer.from_pretrained(model_id)
+    >>> tok.pad_token_id = tok.eos_token_id
+    >>> tok.padding_side = "left"
+
+    >>> inputs = tok(["This is the beginning of a long story", "Alice and Bob are"], padding=True, return_tensors="pt")
+    >>> input_len = inputs["input_ids"].shape[-1]
+
+    >>> # first generate text with watermark and without
+    >>> watermarking_config = WatermarkingConfig(bias=2.5, seeding_scheme="selfhash")
+    >>> out_watermarked = model.generate(**inputs, watermarking_config=watermarking_config, do_sample=False, max_length=20)
+    >>> out = model.generate(**inputs, do_sample=False, max_length=20)
+
+    >>> # now we can instantiate the detector and check the generated text
+    >>> detector = WatermarkDetector(model_config=model.config, device="cpu", watermarking_config=watermarking_config)
+    >>> detection_out_watermarked = detector(out_watermarked, return_dict=True)
+    >>> detection_out = detector(out, return_dict=True)
+    >>> detection_out_watermarked.prediction
+    array([ True,  True])
+
+    >>> detection_out.prediction
+    array([False,  False])
+    ```
+    """
+
+    def __init__(
+        self,
+        model_config: PretrainedConfig,
+        device: str,
+        watermarking_config: Union[WatermarkingConfig, Dict],
+        ignore_repeated_ngrams: bool = False,
+        max_cache_size: int = 128,
+    ):
+        if isinstance(watermarking_config, WatermarkingConfig):
+            watermarking_config = watermarking_config.to_dict()
+
+        self.bos_token_id = (
+            model_config.bos_token_id if not model_config.is_encoder_decoder else model_config.decoder_start_token_id
+        )
+        self.greenlist_ratio = watermarking_config["greenlist_ratio"]
+        self.ignore_repeated_ngrams = ignore_repeated_ngrams
+        self.processor = WatermarkLogitsProcessor(
+            vocab_size=model_config.vocab_size, device=device, **watermarking_config
+        )
+
+        # Expensive re-seeding and sampling is cached.
+        self._get_ngram_score_cached = lru_cache(maxsize=max_cache_size)(self._get_ngram_score)
+
+    def _get_ngram_score(self, prefix: torch.LongTensor, target: int):
+        greenlist_ids = self.processor._get_greenlist_ids(prefix)
+        return target in greenlist_ids
+
+    def _score_ngrams_in_passage(self, input_ids: torch.LongTensor):
+        batch_size, seq_length = input_ids.shape
+        selfhash = int(self.processor.seeding_scheme == "selfhash")
+        n = self.processor.context_width + 1 - selfhash
+        indices = torch.arange(n).unsqueeze(0) + torch.arange(seq_length - n + 1).unsqueeze(1)
+        ngram_tensors = input_ids[:, indices]
+
+        num_tokens_scored_batch = np.zeros(batch_size)
+        green_token_count_batch = np.zeros(batch_size)
+        for batch_idx in range(ngram_tensors.shape[0]):
+            frequencies_table = collections.Counter(ngram_tensors[batch_idx])
+            ngram_to_watermark_lookup = {}
+            for ngram_example in frequencies_table.keys():
+                prefix = ngram_example if selfhash else ngram_example[:-1]
+                target = ngram_example[-1]
+                ngram_to_watermark_lookup[ngram_example] = self._get_ngram_score_cached(prefix, target)
+
+            if self.ignore_repeated_ngrams:
+                # counts a green/red hit once per unique ngram.
+                # num total tokens scored becomes the number unique ngrams.
+                num_tokens_scored_batch[batch_idx] = len(frequencies_table.keys())
+                green_token_count_batch[batch_idx] = sum(ngram_to_watermark_lookup.values())
+            else:
+                num_tokens_scored_batch[batch_idx] = sum(frequencies_table.values())
+                green_token_count_batch[batch_idx] = sum(
+                    freq * outcome
+                    for freq, outcome in zip(frequencies_table.values(), ngram_to_watermark_lookup.values())
+                )
+        return num_tokens_scored_batch, green_token_count_batch
+
+    def _compute_z_score(self, green_token_count: np.array, total_num_tokens: np.array) -> np.array:
+        expected_count = self.greenlist_ratio
+        numer = green_token_count - expected_count * total_num_tokens
+        denom = np.sqrt(total_num_tokens * expected_count * (1 - expected_count))
+        z = numer / denom
+        return z
+
+    def _compute_pval(self, x, loc=0, scale=1):
+        z = (x - loc) / scale
+        return 1 - (0.5 * (1 + np.sign(z) * (1 - np.exp(-2 * z**2 / np.pi))))
+
+    def __call__(
+        self,
+        input_ids: torch.LongTensor,
+        z_threshold: float = 3.0,
+        return_dict: bool = False,
+    ) -> Union[WatermarkDetectorOutput, np.array]:
+        """
+                Args:
+                input_ids (`torch.LongTensor`):
+                    The watermark generated text. It is advised to remove the prompt, which can affect the detection.
+                z_threshold (`Dict`, *optional*, defaults to `3.0`):
+                    Changing this threshold will change the sensitivity of the detector. Higher z threshold gives less
+                    sensitivity and vice versa for lower z threshold.
+                return_dict (`bool`,  *optional*, defaults to `False`):
+                    Whether to return `~generation.WatermarkDetectorOutput` or not. If not it will return boolean predictions,
+        ma
+                Return:
+                    [`~generation.WatermarkDetectorOutput`] or `np.array`: A [`~generation.WatermarkDetectorOutput`]
+                    if `return_dict=True` otherwise a `np.array`.
+
+        """
+
+        # Let's assume that if one batch start with `bos`, all batched also do
+        if input_ids[0, 0] == self.bos_token_id:
+            input_ids = input_ids[:, 1:]
+
+        if input_ids.shape[-1] - self.processor.context_width < 1:
+            raise ValueError(
+                f"Must have at least `1` token to score after the first "
+                f"min_prefix_len={self.processor.context_width} tokens required by the seeding scheme."
+            )
+
+        num_tokens_scored, green_token_count = self._score_ngrams_in_passage(input_ids)
+        z_score = self._compute_z_score(green_token_count, num_tokens_scored)
+        prediction = z_score > z_threshold
+
+        if return_dict:
+            p_value = self._compute_pval(z_score)
+            confidence = 1 - p_value
+
+            return WatermarkDetectorOutput(
+                num_tokens_scored=num_tokens_scored,
+                num_green_tokens=green_token_count,
+                green_fraction=green_token_count / num_tokens_scored,
+                z_score=z_score,
+                p_value=p_value,
+                prediction=prediction,
+                confidence=confidence,
+            )
+        return prediction
+
+
+class BayesianDetectorConfig(PretrainedConfig):
+    """
+    This is the configuration class to store the configuration of a [`BayesianDetectorModel`]. It is used to
+    instantiate a Bayesian Detector model according to the specified arguments.
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+    Args:
+        watermarking_depth (`int`, *optional*):
+            The number of tournament layers.
+        base_rate (`float1`, *optional*, defaults to 0.5):
+            Prior probability P(w) that a text is watermarked.
+    """
+
+    def __init__(self, watermarking_depth: int = None, base_rate: float = 0.5, **kwargs):
+        self.watermarking_depth = watermarking_depth
+        self.base_rate = base_rate
+        # These can be set later to store information about this detector.
+        self.model_name = None
+        self.watermarking_config = None
+
+        super().__init__(**kwargs)
+
+    def set_detector_information(self, model_name, watermarking_config):
+        self.model_name = model_name
+        self.watermarking_config = watermarking_config
+
+
+@dataclass
+class BayesianWatermarkDetectorModelOutput(ModelOutput):
+    """
+    Base class for outputs of models predicting if the text is watermarked.
+
+    Args:
+        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
+            Language modeling loss.
+        posterior_probabilities (`torch.FloatTensor` of shape `(1,)`):
+            Multiple choice classification loss.
+    """
+
+    loss: Optional[torch.FloatTensor] = None
+    posterior_probabilities: Optional[torch.FloatTensor] = None
+
+
+class BayesianDetectorWatermarkedLikelihood(nn.Module):
+    """Watermarked likelihood model for binary-valued g-values.
+
+    This takes in g-values and returns p(g_values|watermarked).
+    """
+
+    def __init__(self, watermarking_depth: int):
+        """Initializes the model parameters."""
+        super().__init__()
+        self.watermarking_depth = watermarking_depth
+        self.beta = torch.nn.Parameter(-2.5 + 0.001 * torch.randn(1, 1, watermarking_depth))
+        self.delta = torch.nn.Parameter(0.001 * torch.randn(1, 1, self.watermarking_depth, watermarking_depth))
+
+    def _compute_latents(self, g_values: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
+        """Computes the unique token probability distribution given g-values.
+
+        Args:
+            g_values (`torch.Tensor` of shape `(batch_size, seq_len, watermarking_depth)`):
+                PRF values.
+
+        Returns:
+            p_one_unique_token and p_two_unique_tokens, both of shape
+            [batch_size, seq_len, watermarking_depth]. p_one_unique_token[i,t,l]
+            gives the probability of there being one unique token in a tournament
+            match on layer l, on timestep t, for batch item i.
+            p_one_unique_token[i,t,l] + p_two_unique_token[i,t,l] = 1.
+        """
+        # Tile g-values to produce feature vectors for predicting the latents
+        # for each layer in the tournament; our model for the latents psi is a
+        # logistic regression model psi = sigmoid(delta * x + beta).
+
+        # [batch_size, seq_len, watermarking_depth, watermarking_depth]
+        x = torch.repeat_interleave(torch.unsqueeze(g_values, dim=-2), self.watermarking_depth, axis=-2)
+
+        # mask all elements above -1 diagonal for autoregressive factorization
+        x = torch.tril(x, diagonal=-1)
+
+        # [batch_size, seq_len, watermarking_depth]
+        # (i, j, k, l) x (i, j, k, l) -> (i, j, k) einsum equivalent
+        logits = (self.delta[..., None, :] @ x.type(self.delta.dtype)[..., None]).squeeze() + self.beta
+
+        p_two_unique_tokens = torch.sigmoid(logits)
+        p_one_unique_token = 1 - p_two_unique_tokens
+        return p_one_unique_token, p_two_unique_tokens
+
+    def forward(self, g_values: torch.Tensor) -> torch.Tensor:
+        """Computes the likelihoods P(g_values|watermarked).
+
+        Args:
+            g_values (`torch.Tensor` of shape `(batch_size, seq_len, watermarking_depth)`):
+                g-values (values 0 or 1)
+
+        Returns:
+            p(g_values|watermarked) of shape [batch_size, seq_len, watermarking_depth].
+        """
+        p_one_unique_token, p_two_unique_tokens = self._compute_latents(g_values)
+
+        # P(g_tl | watermarked) is equal to
+        # 0.5 * [ (g_tl+0.5) * p_two_unique_tokens + p_one_unique_token].
+        return 0.5 * ((g_values + 0.5) * p_two_unique_tokens + p_one_unique_token)
+
+
+class BayesianDetectorModel(PreTrainedModel):
+    r"""
+    Bayesian classifier for watermark detection.
+
+    This detector uses Bayes' rule to compute a watermarking score, which is the sigmoid of the log of ratio of the
+    posterior probabilities P(watermarked|g_values) and P(unwatermarked|g_values). Please see the section on
+    BayesianScore in the paper for further details.
+    Paper URL: https://www.nature.com/articles/s41586-024-08025-4
+
+    Note that this detector only works with non-distortionary Tournament-based watermarking using the Bernoulli(0.5)
+    g-value distribution.
+
+    This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
+    library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
+    etc.)
+
+    This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
+    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
+    and behavior.
+
+    Parameters:
+        config ([`BayesianDetectorConfig`]): Model configuration class with all the parameters of the model.
+            Initializing with a config file does not load the weights associated with the model, only the
+            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
+    """
+
+    config_class = BayesianDetectorConfig
+    base_model_prefix = "model"
+
+    def __init__(self, config):
+        super().__init__(config)
+
+        self.watermarking_depth = config.watermarking_depth
+        self.base_rate = config.base_rate
+        self.likelihood_model_watermarked = BayesianDetectorWatermarkedLikelihood(
+            watermarking_depth=self.watermarking_depth
+        )
+        self.prior = torch.nn.Parameter(torch.tensor([self.base_rate]))
+
+    def _init_weights(self, module):
+        """Initialize the weights."""
+        if isinstance(module, nn.Parameter):
+            module.weight.data.normal_(mean=0.0, std=0.02)
+
+    def _compute_posterior(
+        self,
+        likelihoods_watermarked: torch.Tensor,
+        likelihoods_unwatermarked: torch.Tensor,
+        mask: torch.Tensor,
+        prior: float,
+    ) -> torch.Tensor:
+        """
+        Compute posterior P(w|g) given likelihoods, mask and prior.
+
+        Args:
+            likelihoods_watermarked (`torch.Tensor` of shape `(batch, length, depth)`):
+                Likelihoods P(g_values|watermarked) of g-values under watermarked model.
+            likelihoods_unwatermarked (`torch.Tensor` of shape `(batch, length, depth)`):
+                Likelihoods P(g_values|unwatermarked) of g-values under unwatermarked model.
+            mask (`torch.Tensor` of shape `(batch, length)`):
+                A binary array indicating which g-values should be used. g-values with mask value 0 are discarded.
+            prior (`float`):
+                the prior probability P(w) that the text is watermarked.
+
+        Returns:
+            Posterior probability P(watermarked|g_values), shape [batch].
+        """
+        mask = torch.unsqueeze(mask, dim=-1)
+        prior = torch.clamp(prior, min=1e-5, max=1 - 1e-5)
+        log_likelihoods_watermarked = torch.log(torch.clamp(likelihoods_watermarked, min=1e-30, max=float("inf")))
+        log_likelihoods_unwatermarked = torch.log(torch.clamp(likelihoods_unwatermarked, min=1e-30, max=float("inf")))
+        log_odds = log_likelihoods_watermarked - log_likelihoods_unwatermarked
+
+        # Sum relative surprisals (log odds) across all token positions and layers.
+        relative_surprisal_likelihood = torch.einsum("i...->i", log_odds * mask)
+
+        # Compute the relative surprisal prior
+        relative_surprisal_prior = torch.log(prior) - torch.log(1 - prior)
+
+        # Combine prior and likelihood.
+        # [batch_size]
+        relative_surprisal = relative_surprisal_prior + relative_surprisal_likelihood
+
+        # Compute the posterior probability P(w|g) = sigmoid(relative_surprisal).
+        return torch.sigmoid(relative_surprisal)
+
+    def forward(
+        self,
+        g_values: torch.Tensor,
+        mask: torch.Tensor,
+        labels: Optional[torch.Tensor] = None,
+        loss_batch_weight=1,
+        return_dict=False,
+    ) -> BayesianWatermarkDetectorModelOutput:
+        """
+        Computes the watermarked posterior P(watermarked|g_values).
+
+        Args:
+            g_values (`torch.Tensor` of shape `(batch_size, seq_len, watermarking_depth, ...)`):
+                g-values (with values 0 or 1)
+            mask:
+                A binary array shape [batch_size, seq_len] indicating which g-values should be used. g-values with mask
+                value 0 are discarded.
+
+        Returns:
+            p(watermarked | g_values), of shape [batch_size].
+        """
+
+        likelihoods_watermarked = self.likelihood_model_watermarked(g_values)
+        likelihoods_unwatermarked = 0.5 * torch.ones_like(g_values)
+        out = self._compute_posterior(
+            likelihoods_watermarked=likelihoods_watermarked,
+            likelihoods_unwatermarked=likelihoods_unwatermarked,
+            mask=mask,
+            prior=self.prior,
+        )
+
+        loss = None
+        if labels is not None:
+            loss_fct = BCELoss()
+            loss_unwweight = torch.sum(self.likelihood_model_watermarked.delta**2)
+            loss_weight = loss_unwweight * loss_batch_weight
+            loss = loss_fct(torch.clamp(out, 1e-5, 1 - 1e-5), labels) + loss_weight
+
+        if not return_dict:
+            return (out,) if loss is None else (out, loss)
+
+        return BayesianWatermarkDetectorModelOutput(loss=loss, posterior_probabilities=out)
+
+
+class SynthIDTextWatermarkDetector:
+    r"""
+    SynthID text watermark detector class.
+
+    This class has to be initialized with the trained bayesian detector module check script
+    in examples/synthid_text/detector_training.py for example in training/saving/loading this
+    detector module. The folder also showcases example use case of this detector.
+
+    Parameters:
+        detector_module ([`BayesianDetectorModel`]):
+            Bayesian detector module object initialized with parameters.
+            Check examples/research_projects/synthid_text/detector_training.py for usage.
+        logits_processor (`SynthIDTextWatermarkLogitsProcessor`):
+            The logits processor used for watermarking.
+        tokenizer (`Any`):
+            The tokenizer used for the model.
+
+    Examples:
+    ```python
+    >>> from transformers import (
+    ...     AutoTokenizer, BayesianDetectorModel, SynthIDTextWatermarkLogitsProcessor, SynthIDTextWatermarkDetector
+    ... )
+
+    >>> # Load the detector. See examples/research_projects/synthid_text for training a detector.
+    >>> detector_model = BayesianDetectorModel.from_pretrained("joaogante/dummy_synthid_detector")
+    >>> logits_processor = SynthIDTextWatermarkLogitsProcessor(
+    ...     **detector_model.config.watermarking_config, device="cpu"
+    ... )
+    >>> tokenizer = AutoTokenizer.from_pretrained(detector_model.config.model_name)
+    >>> detector = SynthIDTextWatermarkDetector(detector_model, logits_processor, tokenizer)
+
+    >>> # Test whether a certain string is watermarked
+    >>> test_input = tokenizer(["This is a test input"], return_tensors="pt")
+    >>> is_watermarked = detector(test_input.input_ids)
+    ```
+    """
+
+    def __init__(
+        self,
+        detector_module: BayesianDetectorModel,
+        logits_processor: SynthIDTextWatermarkLogitsProcessor,
+        tokenizer: Any,
+    ):
+        self.detector_module = detector_module
+        self.logits_processor = logits_processor
+        self.tokenizer = tokenizer
+
+    def __call__(self, tokenized_outputs: torch.Tensor):
+        # eos mask is computed, skip first ngram_len - 1 tokens
+        # eos_mask will be of shape [batch_size, output_len]
+        eos_token_mask = self.logits_processor.compute_eos_token_mask(
+            input_ids=tokenized_outputs,
+            eos_token_id=self.tokenizer.eos_token_id,
+        )[:, self.logits_processor.ngram_len - 1 :]
+
+        # context repetition mask is computed
+        context_repetition_mask = self.logits_processor.compute_context_repetition_mask(
+            input_ids=tokenized_outputs,
+        )
+        # context repitition mask shape [batch_size, output_len - (ngram_len - 1)]
+
+        combined_mask = context_repetition_mask * eos_token_mask
+
+        g_values = self.logits_processor.compute_g_values(
+            input_ids=tokenized_outputs,
+        )
+        # g values shape [batch_size, output_len - (ngram_len - 1), depth]
+        return self.detector_module(g_values, combined_mask)
diff --git a/.venv/lib/python3.11/site-packages/transformers/kernels/__pycache__/__init__.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/kernels/__pycache__/__init__.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d758974b8c6aac09cf4f2e56edcd7baf6e0c8422
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/kernels/__pycache__/__init__.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/transformers/kernels/deformable_detr/cpu/ms_deform_attn_cpu.cpp b/.venv/lib/python3.11/site-packages/transformers/kernels/deformable_detr/cpu/ms_deform_attn_cpu.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..388a73d22d4c9b561e2a887b50a1897b8cf2def9
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/transformers/kernels/deformable_detr/cpu/ms_deform_attn_cpu.cpp
@@ -0,0 +1,40 @@
+/*!
+**************************************************************************************************
+* Deformable DETR
+* Copyright (c) 2020 SenseTime. All Rights Reserved.
+* Licensed under the Apache License, Version 2.0 [see LICENSE for details]
+**************************************************************************************************
+* Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
+**************************************************************************************************
+*/
+
+#include <vector>
+
+#include <ATen/ATen.h>
+#include <ATen/cuda/CUDAContext.h>
+
+
+at::Tensor
+ms_deform_attn_cpu_forward(
+    const at::Tensor &value, 
+    const at::Tensor &spatial_shapes,
+    const at::Tensor &level_start_index,
+    const at::Tensor &sampling_loc,
+    const at::Tensor &attn_weight,
+    const int im2col_step)
+{
+    AT_ERROR("Not implement on cpu");
+}
+
+std::vector<at::Tensor>
+ms_deform_attn_cpu_backward(
+    const at::Tensor &value, 
+    const at::Tensor &spatial_shapes,
+    const at::Tensor &level_start_index,
+    const at::Tensor &sampling_loc,
+    const at::Tensor &attn_weight,
+    const at::Tensor &grad_output,
+    const int im2col_step)
+{
+    AT_ERROR("Not implement on cpu");
+}
diff --git a/.venv/lib/python3.11/site-packages/transformers/kernels/deformable_detr/cpu/ms_deform_attn_cpu.h b/.venv/lib/python3.11/site-packages/transformers/kernels/deformable_detr/cpu/ms_deform_attn_cpu.h
new file mode 100644
index 0000000000000000000000000000000000000000..7eac8c8bcd1bf529bb9c13d54d2d4215c9e4c89f
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/transformers/kernels/deformable_detr/cpu/ms_deform_attn_cpu.h
@@ -0,0 +1,32 @@
+/*!
+**************************************************************************************************
+* Deformable DETR
+* Copyright (c) 2020 SenseTime. All Rights Reserved.
+* Licensed under the Apache License, Version 2.0 [see LICENSE for details]
+**************************************************************************************************
+* Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
+**************************************************************************************************
+*/
+
+#pragma once
+#include <torch/extension.h>
+
+at::Tensor
+ms_deform_attn_cpu_forward(
+    const at::Tensor &value, 
+    const at::Tensor &spatial_shapes,
+    const at::Tensor &level_start_index,
+    const at::Tensor &sampling_loc,
+    const at::Tensor &attn_weight,
+    const int im2col_step);
+
+std::vector<at::Tensor>
+ms_deform_attn_cpu_backward(
+    const at::Tensor &value, 
+    const at::Tensor &spatial_shapes,
+    const at::Tensor &level_start_index,
+    const at::Tensor &sampling_loc,
+    const at::Tensor &attn_weight,
+    const at::Tensor &grad_output,
+    const int im2col_step);
+
diff --git a/.venv/lib/python3.11/site-packages/transformers/kernels/deformable_detr/cuda/ms_deform_attn_cuda.cu b/.venv/lib/python3.11/site-packages/transformers/kernels/deformable_detr/cuda/ms_deform_attn_cuda.cu
new file mode 100644
index 0000000000000000000000000000000000000000..0cd34f5df8b7dc917f5e4332d63c33d2da92d195
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/transformers/kernels/deformable_detr/cuda/ms_deform_attn_cuda.cu
@@ -0,0 +1,159 @@
+/*!
+**************************************************************************************************
+* Deformable DETR
+* Copyright (c) 2020 SenseTime. All Rights Reserved.
+* Licensed under the Apache License, Version 2.0 [see LICENSE for details]
+**************************************************************************************************
+* Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
+**************************************************************************************************
+*/
+
+#include <vector>
+#include "cuda/ms_deform_im2col_cuda.cuh"
+
+#include <ATen/ATen.h>
+#include <ATen/cuda/CUDAContext.h>
+#include <cuda.h>
+#include <cuda_runtime.h>
+
+#pragma once
+#include <torch/extension.h>
+
+
+at::Tensor ms_deform_attn_cuda_forward(
+    const at::Tensor &value, 
+    const at::Tensor &spatial_shapes,
+    const at::Tensor &level_start_index,
+    const at::Tensor &sampling_loc,
+    const at::Tensor &attn_weight,
+    const int im2col_step)
+{
+    at::DeviceGuard guard(value.device());
+
+    AT_ASSERTM(value.is_contiguous(), "value tensor has to be contiguous");
+    AT_ASSERTM(spatial_shapes.is_contiguous(), "spatial_shapes tensor has to be contiguous");
+    AT_ASSERTM(level_start_index.is_contiguous(), "level_start_index tensor has to be contiguous");
+    AT_ASSERTM(sampling_loc.is_contiguous(), "sampling_loc tensor has to be contiguous");
+    AT_ASSERTM(attn_weight.is_contiguous(), "attn_weight tensor has to be contiguous");
+
+    AT_ASSERTM(value.type().is_cuda(), "value must be a CUDA tensor");
+    AT_ASSERTM(spatial_shapes.type().is_cuda(), "spatial_shapes must be a CUDA tensor");
+    AT_ASSERTM(level_start_index.type().is_cuda(), "level_start_index must be a CUDA tensor");
+    AT_ASSERTM(sampling_loc.type().is_cuda(), "sampling_loc must be a CUDA tensor");
+    AT_ASSERTM(attn_weight.type().is_cuda(), "attn_weight must be a CUDA tensor");
+
+    const int batch = value.size(0);
+    const int spatial_size = value.size(1);
+    const int num_heads = value.size(2);
+    const int channels = value.size(3);
+
+    const int num_levels = spatial_shapes.size(0);
+
+    const int num_query = sampling_loc.size(1);
+    const int num_point = sampling_loc.size(4);
+
+    const int im2col_step_ = std::min(batch, im2col_step);
+
+    AT_ASSERTM(batch % im2col_step_ == 0, "batch(%d) must divide im2col_step(%d)", batch, im2col_step_);
+    
+    auto output = at::zeros({batch, num_query, num_heads, channels}, value.options());
+
+    const int batch_n = im2col_step_;
+    auto output_n = output.view({batch/im2col_step_, batch_n, num_query, num_heads, channels});
+    auto per_value_size = spatial_size * num_heads * channels;
+    auto per_sample_loc_size = num_query * num_heads * num_levels * num_point * 2;
+    auto per_attn_weight_size = num_query * num_heads * num_levels * num_point;
+    for (int n = 0; n < batch/im2col_step_; ++n)
+    {
+        auto columns = output_n.select(0, n);
+        AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, value.type(), "ms_deform_attn_forward_cuda", ([&] {
+            ms_deformable_im2col_cuda(at::cuda::getCurrentCUDAStream(),
+                value.data<scalar_t>() + n * im2col_step_ * per_value_size,
+                spatial_shapes.data<int64_t>(),
+                level_start_index.data<int64_t>(),
+                sampling_loc.data<scalar_t>() + n * im2col_step_ * per_sample_loc_size,
+                attn_weight.data<scalar_t>() + n * im2col_step_ * per_attn_weight_size,
+                batch_n, spatial_size, num_heads, channels, num_levels, num_query, num_point,
+                columns.data<scalar_t>());
+
+        }));
+    }
+
+    output = output.view({batch, num_query, num_heads*channels});
+
+    return output;
+}
+
+
+std::vector<at::Tensor> ms_deform_attn_cuda_backward(
+    const at::Tensor &value, 
+    const at::Tensor &spatial_shapes,
+    const at::Tensor &level_start_index,
+    const at::Tensor &sampling_loc,
+    const at::Tensor &attn_weight,
+    const at::Tensor &grad_output,
+    const int im2col_step)
+{
+    at::DeviceGuard guard(value.device());
+
+    AT_ASSERTM(value.is_contiguous(), "value tensor has to be contiguous");
+    AT_ASSERTM(spatial_shapes.is_contiguous(), "spatial_shapes tensor has to be contiguous");
+    AT_ASSERTM(level_start_index.is_contiguous(), "level_start_index tensor has to be contiguous");
+    AT_ASSERTM(sampling_loc.is_contiguous(), "sampling_loc tensor has to be contiguous");
+    AT_ASSERTM(attn_weight.is_contiguous(), "attn_weight tensor has to be contiguous");
+    AT_ASSERTM(grad_output.is_contiguous(), "grad_output tensor has to be contiguous");
+
+    AT_ASSERTM(value.type().is_cuda(), "value must be a CUDA tensor");
+    AT_ASSERTM(spatial_shapes.type().is_cuda(), "spatial_shapes must be a CUDA tensor");
+    AT_ASSERTM(level_start_index.type().is_cuda(), "level_start_index must be a CUDA tensor");
+    AT_ASSERTM(sampling_loc.type().is_cuda(), "sampling_loc must be a CUDA tensor");
+    AT_ASSERTM(attn_weight.type().is_cuda(), "attn_weight must be a CUDA tensor");
+    AT_ASSERTM(grad_output.type().is_cuda(), "grad_output must be a CUDA tensor");
+
+    const int batch = value.size(0);
+    const int spatial_size = value.size(1);
+    const int num_heads = value.size(2);
+    const int channels = value.size(3);
+
+    const int num_levels = spatial_shapes.size(0);
+
+    const int num_query = sampling_loc.size(1);
+    const int num_point = sampling_loc.size(4);
+
+    const int im2col_step_ = std::min(batch, im2col_step);
+
+    AT_ASSERTM(batch % im2col_step_ == 0, "batch(%d) must divide im2col_step(%d)", batch, im2col_step_);
+
+    auto grad_value = at::zeros_like(value);
+    auto grad_sampling_loc = at::zeros_like(sampling_loc);
+    auto grad_attn_weight = at::zeros_like(attn_weight);
+
+    const int batch_n = im2col_step_;
+    auto per_value_size = spatial_size * num_heads * channels;
+    auto per_sample_loc_size = num_query * num_heads * num_levels * num_point * 2;
+    auto per_attn_weight_size = num_query * num_heads * num_levels * num_point;
+    auto grad_output_n = grad_output.view({batch/im2col_step_, batch_n, num_query, num_heads, channels});
+    
+    for (int n = 0; n < batch/im2col_step_; ++n)
+    {
+        auto grad_output_g = grad_output_n.select(0, n);
+        AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, value.type(), "ms_deform_attn_backward_cuda", ([&] {
+            ms_deformable_col2im_cuda(at::cuda::getCurrentCUDAStream(),
+                                    grad_output_g.data<scalar_t>(),
+                                    value.data<scalar_t>() + n * im2col_step_ * per_value_size,
+                                    spatial_shapes.data<int64_t>(),
+                                    level_start_index.data<int64_t>(),
+                                    sampling_loc.data<scalar_t>() + n * im2col_step_ * per_sample_loc_size,
+                                    attn_weight.data<scalar_t>() + n * im2col_step_ * per_attn_weight_size,
+                                    batch_n, spatial_size, num_heads, channels, num_levels, num_query, num_point,
+                                    grad_value.data<scalar_t>() +  n * im2col_step_ * per_value_size,
+                                    grad_sampling_loc.data<scalar_t>() + n * im2col_step_ * per_sample_loc_size,
+                                    grad_attn_weight.data<scalar_t>() + n * im2col_step_ * per_attn_weight_size);
+
+        }));
+    }
+
+    return {
+        grad_value, grad_sampling_loc, grad_attn_weight
+    };
+}
diff --git a/.venv/lib/python3.11/site-packages/transformers/kernels/deformable_detr/cuda/ms_deform_attn_cuda.cuh b/.venv/lib/python3.11/site-packages/transformers/kernels/deformable_detr/cuda/ms_deform_attn_cuda.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..95385869659b92fe352680543ed43d8a8aadc71c
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/transformers/kernels/deformable_detr/cuda/ms_deform_attn_cuda.cuh
@@ -0,0 +1,1467 @@
+/*!
+**************************************************************************************************
+* Deformable DETR
+* Copyright (c) 2020 SenseTime. All Rights Reserved.
+* Licensed under the Apache License, Version 2.0 [see LICENSE for details]
+**************************************************************************************************
+* Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
+**************************************************************************************************
+*/
+
+#include <vector>
+
+#include <cuda.h>
+#include <cuda_runtime.h>
+
+#include <cstdio>
+#include <algorithm>
+#include <cstring>
+
+#include <ATen/ATen.h>
+#include <ATen/cuda/CUDAContext.h>
+
+#include <THC/THCAtomics.cuh>
+
+#define CUDA_KERNEL_LOOP(i, n)                          \
+  for (int i = blockIdx.x * blockDim.x + threadIdx.x;   \
+      i < (n);                                          \
+      i += blockDim.x * gridDim.x)
+
+
+at::Tensor ms_deform_attn_cuda_forward(
+    const at::Tensor &value, 
+    const at::Tensor &spatial_shapes,
+    const at::Tensor &level_start_index,
+    const at::Tensor &sampling_loc,
+    const at::Tensor &attn_weight,
+    const int im2col_step)
+{
+    AT_ASSERTM(value.is_contiguous(), "value tensor has to be contiguous");
+    AT_ASSERTM(spatial_shapes.is_contiguous(), "spatial_shapes tensor has to be contiguous");
+    AT_ASSERTM(level_start_index.is_contiguous(), "level_start_index tensor has to be contiguous");
+    AT_ASSERTM(sampling_loc.is_contiguous(), "sampling_loc tensor has to be contiguous");
+    AT_ASSERTM(attn_weight.is_contiguous(), "attn_weight tensor has to be contiguous");
+
+    AT_ASSERTM(value.type().is_cuda(), "value must be a CUDA tensor");
+    AT_ASSERTM(spatial_shapes.type().is_cuda(), "spatial_shapes must be a CUDA tensor");
+    AT_ASSERTM(level_start_index.type().is_cuda(), "level_start_index must be a CUDA tensor");
+    AT_ASSERTM(sampling_loc.type().is_cuda(), "sampling_loc must be a CUDA tensor");
+    AT_ASSERTM(attn_weight.type().is_cuda(), "attn_weight must be a CUDA tensor");
+
+    const int batch = value.size(0);
+    const int spatial_size = value.size(1);
+    const int num_heads = value.size(2);
+    const int channels = value.size(3);
+
+    const int num_levels = spatial_shapes.size(0);
+
+    const int num_query = sampling_loc.size(1);
+    const int num_point = sampling_loc.size(4);
+
+    const int im2col_step_ = std::min(batch, im2col_step);
+
+    AT_ASSERTM(batch % im2col_step_ == 0, "batch(%d) must divide im2col_step(%d)", batch, im2col_step_);
+    
+    auto output = at::zeros({batch, num_query, num_heads, channels}, value.options());
+
+    const int batch_n = im2col_step_;
+    auto output_n = output.view({batch/im2col_step_, batch_n, num_query, num_heads, channels});
+    auto per_value_size = spatial_size * num_heads * channels;
+    auto per_sample_loc_size = num_query * num_heads * num_levels * num_point * 2;
+    auto per_attn_weight_size = num_query * num_heads * num_levels * num_point;
+    for (int n = 0; n < batch/im2col_step_; ++n)
+    {
+        auto columns = output_n.select(0, n);
+        AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, value.type(), "ms_deform_attn_forward_cuda", ([&] {
+            ms_deformable_im2col_cuda(at::cuda::getCurrentCUDAStream(),
+                value.data<scalar_t>() + n * im2col_step_ * per_value_size,
+                spatial_shapes.data<int64_t>(),
+                level_start_index.data<int64_t>(),
+                sampling_loc.data<scalar_t>() + n * im2col_step_ * per_sample_loc_size,
+                attn_weight.data<scalar_t>() + n * im2col_step_ * per_attn_weight_size,
+                batch_n, spatial_size, num_heads, channels, num_levels, num_query, num_point,
+                columns.data<scalar_t>());
+
+        }));
+    }
+
+    output = output.view({batch, num_query, num_heads*channels});
+
+    return output;
+}
+
+
+std::vector<at::Tensor> ms_deform_attn_cuda_backward(
+    const at::Tensor &value, 
+    const at::Tensor &spatial_shapes,
+    const at::Tensor &level_start_index,
+    const at::Tensor &sampling_loc,
+    const at::Tensor &attn_weight,
+    const at::Tensor &grad_output,
+    const int im2col_step)
+{
+
+    AT_ASSERTM(value.is_contiguous(), "value tensor has to be contiguous");
+    AT_ASSERTM(spatial_shapes.is_contiguous(), "spatial_shapes tensor has to be contiguous");
+    AT_ASSERTM(level_start_index.is_contiguous(), "level_start_index tensor has to be contiguous");
+    AT_ASSERTM(sampling_loc.is_contiguous(), "sampling_loc tensor has to be contiguous");
+    AT_ASSERTM(attn_weight.is_contiguous(), "attn_weight tensor has to be contiguous");
+    AT_ASSERTM(grad_output.is_contiguous(), "grad_output tensor has to be contiguous");
+
+    AT_ASSERTM(value.type().is_cuda(), "value must be a CUDA tensor");
+    AT_ASSERTM(spatial_shapes.type().is_cuda(), "spatial_shapes must be a CUDA tensor");
+    AT_ASSERTM(level_start_index.type().is_cuda(), "level_start_index must be a CUDA tensor");
+    AT_ASSERTM(sampling_loc.type().is_cuda(), "sampling_loc must be a CUDA tensor");
+    AT_ASSERTM(attn_weight.type().is_cuda(), "attn_weight must be a CUDA tensor");
+    AT_ASSERTM(grad_output.type().is_cuda(), "grad_output must be a CUDA tensor");
+
+    const int batch = value.size(0);
+    const int spatial_size = value.size(1);
+    const int num_heads = value.size(2);
+    const int channels = value.size(3);
+
+    const int num_levels = spatial_shapes.size(0);
+
+    const int num_query = sampling_loc.size(1);
+    const int num_point = sampling_loc.size(4);
+
+    const int im2col_step_ = std::min(batch, im2col_step);
+
+    AT_ASSERTM(batch % im2col_step_ == 0, "batch(%d) must divide im2col_step(%d)", batch, im2col_step_);
+
+    auto grad_value = at::zeros_like(value);
+    auto grad_sampling_loc = at::zeros_like(sampling_loc);
+    auto grad_attn_weight = at::zeros_like(attn_weight);
+
+    const int batch_n = im2col_step_;
+    auto per_value_size = spatial_size * num_heads * channels;
+    auto per_sample_loc_size = num_query * num_heads * num_levels * num_point * 2;
+    auto per_attn_weight_size = num_query * num_heads * num_levels * num_point;
+    auto grad_output_n = grad_output.view({batch/im2col_step_, batch_n, num_query, num_heads, channels});
+    
+    for (int n = 0; n < batch/im2col_step_; ++n)
+    {
+        auto grad_output_g = grad_output_n.select(0, n);
+        AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, value.type(), "ms_deform_attn_backward_cuda", ([&] {
+            ms_deformable_col2im_cuda(at::cuda::getCurrentCUDAStream(),
+                                    grad_output_g.data<scalar_t>(),
+                                    value.data<scalar_t>() + n * im2col_step_ * per_value_size,
+                                    spatial_shapes.data<int64_t>(),
+                                    level_start_index.data<int64_t>(),
+                                    sampling_loc.data<scalar_t>() + n * im2col_step_ * per_sample_loc_size,
+                                    attn_weight.data<scalar_t>() + n * im2col_step_ * per_attn_weight_size,
+                                    batch_n, spatial_size, num_heads, channels, num_levels, num_query, num_point,
+                                    grad_value.data<scalar_t>() +  n * im2col_step_ * per_value_size,
+                                    grad_sampling_loc.data<scalar_t>() + n * im2col_step_ * per_sample_loc_size,
+                                    grad_attn_weight.data<scalar_t>() + n * im2col_step_ * per_attn_weight_size);
+
+        }));
+    }
+
+    return {
+        grad_value, grad_sampling_loc, grad_attn_weight
+    };
+}
+
+const int CUDA_NUM_THREADS = 1024;
+inline int GET_BLOCKS(const int N, const int num_threads)
+{
+  return (N + num_threads - 1) / num_threads;
+}
+
+
+template <typename scalar_t>
+__device__ scalar_t ms_deform_attn_im2col_bilinear(const scalar_t* &bottom_data, 
+                                                   const int &height, const int &width, const int &nheads, const int &channels,
+                                                   const scalar_t &h, const scalar_t &w, const int &m, const int &c)
+{
+  const int h_low = floor(h);
+  const int w_low = floor(w);
+  const int h_high = h_low + 1;
+  const int w_high = w_low + 1;
+
+  const scalar_t lh = h - h_low;
+  const scalar_t lw = w - w_low;
+  const scalar_t hh = 1 - lh, hw = 1 - lw;
+
+  const int w_stride = nheads * channels;
+  const int h_stride = width * w_stride;
+  const int h_low_ptr_offset = h_low * h_stride;
+  const int h_high_ptr_offset = h_low_ptr_offset + h_stride;
+  const int w_low_ptr_offset = w_low * w_stride;
+  const int w_high_ptr_offset = w_low_ptr_offset + w_stride;
+  const int base_ptr = m * channels + c;
+
+  scalar_t v1 = 0;
+  if (h_low >= 0 && w_low >= 0)
+  {
+    const int ptr1 = h_low_ptr_offset + w_low_ptr_offset + base_ptr;
+    v1 = bottom_data[ptr1];
+  }
+  scalar_t v2 = 0;
+  if (h_low >= 0 && w_high <= width - 1)
+  {
+    const int ptr2 = h_low_ptr_offset + w_high_ptr_offset + base_ptr;
+    v2 = bottom_data[ptr2];
+  }
+  scalar_t v3 = 0;
+  if (h_high <= height - 1 && w_low >= 0)
+  {
+    const int ptr3 = h_high_ptr_offset + w_low_ptr_offset + base_ptr;
+    v3 = bottom_data[ptr3];
+  }
+  scalar_t v4 = 0;
+  if (h_high <= height - 1 && w_high <= width - 1)
+  {
+    const int ptr4 = h_high_ptr_offset + w_high_ptr_offset + base_ptr;
+    v4 = bottom_data[ptr4];
+  }
+
+  const scalar_t w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw;
+
+  const scalar_t val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);
+  return val;
+}
+
+
+template <typename scalar_t>
+__device__ void ms_deform_attn_col2im_bilinear(const scalar_t* &bottom_data, 
+                                                   const int &height, const int &width, const int &nheads, const int &channels,
+                                                   const scalar_t &h, const scalar_t &w, const int &m, const int &c,
+                                                   const scalar_t &top_grad,
+                                                   const scalar_t &attn_weight,
+                                                   scalar_t* &grad_value, 
+                                                   scalar_t* grad_sampling_loc,
+                                                   scalar_t* grad_attn_weight)
+{
+  const int h_low = floor(h);
+  const int w_low = floor(w);
+  const int h_high = h_low + 1;
+  const int w_high = w_low + 1;
+
+  const scalar_t lh = h - h_low;
+  const scalar_t lw = w - w_low;
+  const scalar_t hh = 1 - lh, hw = 1 - lw;
+
+  const int w_stride = nheads * channels;
+  const int h_stride = width * w_stride;
+  const int h_low_ptr_offset = h_low * h_stride;
+  const int h_high_ptr_offset = h_low_ptr_offset + h_stride;
+  const int w_low_ptr_offset = w_low * w_stride;
+  const int w_high_ptr_offset = w_low_ptr_offset + w_stride;
+  const int base_ptr = m * channels + c;
+
+  const scalar_t w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw;
+  const scalar_t top_grad_value = top_grad * attn_weight;
+  scalar_t grad_h_weight = 0, grad_w_weight = 0;
+
+  scalar_t v1 = 0;
+  if (h_low >= 0 && w_low >= 0)
+  {
+    const int ptr1 = h_low_ptr_offset + w_low_ptr_offset + base_ptr;
+    v1 = bottom_data[ptr1];
+    grad_h_weight -= hw * v1;
+    grad_w_weight -= hh * v1;
+    atomicAdd(grad_value+ptr1, w1*top_grad_value);
+  }
+  scalar_t v2 = 0;
+  if (h_low >= 0 && w_high <= width - 1)
+  {
+    const int ptr2 = h_low_ptr_offset + w_high_ptr_offset + base_ptr;
+    v2 = bottom_data[ptr2];
+    grad_h_weight -= lw * v2;
+    grad_w_weight += hh * v2;
+    atomicAdd(grad_value+ptr2, w2*top_grad_value);
+  }
+  scalar_t v3 = 0;
+  if (h_high <= height - 1 && w_low >= 0)
+  {
+    const int ptr3 = h_high_ptr_offset + w_low_ptr_offset + base_ptr;
+    v3 = bottom_data[ptr3];
+    grad_h_weight += hw * v3;
+    grad_w_weight -= lh * v3;
+    atomicAdd(grad_value+ptr3, w3*top_grad_value); 
+  }
+  scalar_t v4 = 0;
+  if (h_high <= height - 1 && w_high <= width - 1)
+  {
+    const int ptr4 = h_high_ptr_offset + w_high_ptr_offset + base_ptr;
+    v4 = bottom_data[ptr4];
+    grad_h_weight += lw * v4;
+    grad_w_weight += lh * v4;
+    atomicAdd(grad_value+ptr4, w4*top_grad_value);
+  }
+
+  const scalar_t val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);
+  *grad_attn_weight = top_grad * val;
+  *grad_sampling_loc = width * grad_w_weight * top_grad_value;
+  *(grad_sampling_loc + 1) = height * grad_h_weight * top_grad_value;
+}
+
+
+template <typename scalar_t>
+__device__ void ms_deform_attn_col2im_bilinear_gm(const scalar_t* &bottom_data, 
+                                                   const int &height, const int &width, const int &nheads, const int &channels,
+                                                   const scalar_t &h, const scalar_t &w, const int &m, const int &c,
+                                                   const scalar_t &top_grad,
+                                                   const scalar_t &attn_weight,
+                                                   scalar_t* &grad_value, 
+                                                   scalar_t* grad_sampling_loc,
+                                                   scalar_t* grad_attn_weight)
+{
+  const int h_low = floor(h);
+  const int w_low = floor(w);
+  const int h_high = h_low + 1;
+  const int w_high = w_low + 1;
+
+  const scalar_t lh = h - h_low;
+  const scalar_t lw = w - w_low;
+  const scalar_t hh = 1 - lh, hw = 1 - lw;
+
+  const int w_stride = nheads * channels;
+  const int h_stride = width * w_stride;
+  const int h_low_ptr_offset = h_low * h_stride;
+  const int h_high_ptr_offset = h_low_ptr_offset + h_stride;
+  const int w_low_ptr_offset = w_low * w_stride;
+  const int w_high_ptr_offset = w_low_ptr_offset + w_stride;
+  const int base_ptr = m * channels + c;
+
+  const scalar_t w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw;
+  const scalar_t top_grad_value = top_grad * attn_weight;
+  scalar_t grad_h_weight = 0, grad_w_weight = 0;
+
+  scalar_t v1 = 0;
+  if (h_low >= 0 && w_low >= 0)
+  {
+    const int ptr1 = h_low_ptr_offset + w_low_ptr_offset + base_ptr;
+    v1 = bottom_data[ptr1];
+    grad_h_weight -= hw * v1;
+    grad_w_weight -= hh * v1;
+    atomicAdd(grad_value+ptr1, w1*top_grad_value);
+  }
+  scalar_t v2 = 0;
+  if (h_low >= 0 && w_high <= width - 1)
+  {
+    const int ptr2 = h_low_ptr_offset + w_high_ptr_offset + base_ptr;
+    v2 = bottom_data[ptr2];
+    grad_h_weight -= lw * v2;
+    grad_w_weight += hh * v2;
+    atomicAdd(grad_value+ptr2, w2*top_grad_value);
+  }
+  scalar_t v3 = 0;
+  if (h_high <= height - 1 && w_low >= 0)
+  {
+    const int ptr3 = h_high_ptr_offset + w_low_ptr_offset + base_ptr;
+    v3 = bottom_data[ptr3];
+    grad_h_weight += hw * v3;
+    grad_w_weight -= lh * v3;
+    atomicAdd(grad_value+ptr3, w3*top_grad_value); 
+  }
+  scalar_t v4 = 0;
+  if (h_high <= height - 1 && w_high <= width - 1)
+  {
+    const int ptr4 = h_high_ptr_offset + w_high_ptr_offset + base_ptr;
+    v4 = bottom_data[ptr4];
+    grad_h_weight += lw * v4;
+    grad_w_weight += lh * v4;
+    atomicAdd(grad_value+ptr4, w4*top_grad_value);
+  }
+
+  const scalar_t val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);
+  atomicAdd(grad_attn_weight, top_grad * val); 
+  atomicAdd(grad_sampling_loc, width * grad_w_weight * top_grad_value);
+  atomicAdd(grad_sampling_loc + 1, height * grad_h_weight * top_grad_value);
+}
+
+
+template <typename scalar_t>
+__global__ void ms_deformable_im2col_gpu_kernel(const int n,
+                                                const scalar_t *data_value, 
+                                                const int64_t *data_spatial_shapes,
+                                                const int64_t *data_level_start_index, 
+                                                const scalar_t *data_sampling_loc,
+                                                const scalar_t *data_attn_weight,
+                                                const int batch_size, 
+                                                const int spatial_size, 
+                                                const int num_heads,
+                                                const int channels, 
+                                                const int num_levels,
+                                                const int num_query,
+                                                const int num_point,
+                                                scalar_t *data_col)
+{
+  CUDA_KERNEL_LOOP(index, n)
+  {
+    int _temp = index;
+    const int c_col = _temp % channels;
+    _temp /= channels;
+    const int sampling_index = _temp; 
+    const int m_col = _temp % num_heads;
+    _temp /= num_heads;
+    const int q_col = _temp % num_query;
+    _temp /= num_query;
+    const int b_col = _temp;
+
+    scalar_t *data_col_ptr = data_col + index;
+    int data_weight_ptr = sampling_index * num_levels * num_point;
+    int data_loc_w_ptr = data_weight_ptr << 1;
+    const int qid_stride = num_heads * channels;
+    const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride;
+    scalar_t col = 0;
+    
+    for (int l_col=0; l_col < num_levels; ++l_col)
+    {
+      const int level_start_id = data_level_start_index[l_col];
+      const int spatial_h_ptr = l_col << 1;
+      const int spatial_h = data_spatial_shapes[spatial_h_ptr];
+      const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1];
+      const scalar_t *data_value_ptr = data_value + (data_value_ptr_init_offset + level_start_id * qid_stride);
+      for (int p_col=0; p_col < num_point; ++p_col)
+      {
+        const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr];
+        const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1];
+        const scalar_t weight = data_attn_weight[data_weight_ptr];
+
+        const scalar_t h_im = loc_h * spatial_h - 0.5;
+        const scalar_t w_im = loc_w * spatial_w - 0.5;
+
+        if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w)
+        {
+          col += ms_deform_attn_im2col_bilinear(data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, w_im, m_col, c_col) * weight;
+        }
+
+        data_weight_ptr += 1;
+        data_loc_w_ptr += 2;
+      }
+    }
+    *data_col_ptr = col;
+  }
+}
+
+template <typename scalar_t, unsigned int blockSize>
+__global__ void ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1(const int n,
+                                                const scalar_t *grad_col,
+                                                const scalar_t *data_value,
+                                                const int64_t *data_spatial_shapes,
+                                                const int64_t *data_level_start_index, 
+                                                const scalar_t *data_sampling_loc,
+                                                const scalar_t *data_attn_weight,
+                                                const int batch_size, 
+                                                const int spatial_size, 
+                                                const int num_heads,
+                                                const int channels, 
+                                                const int num_levels,
+                                                const int num_query,
+                                                const int num_point,
+                                                scalar_t *grad_value,
+                                                scalar_t *grad_sampling_loc,
+                                                scalar_t *grad_attn_weight)
+{
+  CUDA_KERNEL_LOOP(index, n)
+  {
+    __shared__ scalar_t cache_grad_sampling_loc[blockSize * 2];
+    __shared__ scalar_t cache_grad_attn_weight[blockSize];
+    unsigned int tid = threadIdx.x;
+    int _temp = index;
+    const int c_col = _temp % channels;
+    _temp /= channels;
+    const int sampling_index = _temp; 
+    const int m_col = _temp % num_heads;
+    _temp /= num_heads;
+    const int q_col = _temp % num_query;
+    _temp /= num_query;
+    const int b_col = _temp;
+
+    const scalar_t top_grad = grad_col[index];
+
+    int data_weight_ptr = sampling_index * num_levels * num_point;
+    int data_loc_w_ptr = data_weight_ptr << 1;
+    const int grad_sampling_ptr = data_weight_ptr;
+    grad_sampling_loc += grad_sampling_ptr << 1;
+    grad_attn_weight += grad_sampling_ptr;
+    const int grad_weight_stride = 1;
+    const int grad_loc_stride = 2;
+    const int qid_stride = num_heads * channels;
+    const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride;
+
+    for (int l_col=0; l_col < num_levels; ++l_col)
+    {
+      const int level_start_id = data_level_start_index[l_col];
+      const int spatial_h_ptr = l_col << 1;
+      const int spatial_h = data_spatial_shapes[spatial_h_ptr];
+      const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1];
+      const int value_ptr_offset = data_value_ptr_init_offset + level_start_id * qid_stride;
+      const scalar_t *data_value_ptr = data_value + value_ptr_offset;
+      scalar_t *grad_value_ptr = grad_value + value_ptr_offset;
+
+      for (int p_col=0; p_col < num_point; ++p_col)
+      {
+        const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr];
+        const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1];
+        const scalar_t weight = data_attn_weight[data_weight_ptr];
+
+        const scalar_t h_im = loc_h * spatial_h - 0.5;
+        const scalar_t w_im = loc_w * spatial_w - 0.5;
+        *(cache_grad_sampling_loc+(threadIdx.x << 1)) = 0;
+        *(cache_grad_sampling_loc+((threadIdx.x << 1) + 1)) = 0;
+        *(cache_grad_attn_weight+threadIdx.x)=0;
+        if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w)
+        {
+          ms_deform_attn_col2im_bilinear(
+            data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, w_im, m_col, c_col,
+            top_grad, weight, grad_value_ptr, 
+            cache_grad_sampling_loc+(threadIdx.x << 1), cache_grad_attn_weight+threadIdx.x);
+        }
+        
+        __syncthreads();
+        if (tid == 0)
+        {
+          scalar_t _grad_w=cache_grad_sampling_loc[0], _grad_h=cache_grad_sampling_loc[1], _grad_a=cache_grad_attn_weight[0];
+          int sid=2;
+          for (unsigned int tid = 1; tid < blockSize; ++tid)
+          {
+            _grad_w += cache_grad_sampling_loc[sid];
+            _grad_h += cache_grad_sampling_loc[sid + 1];
+            _grad_a += cache_grad_attn_weight[tid];
+            sid += 2;
+          }
+          
+          
+          *grad_sampling_loc = _grad_w;
+          *(grad_sampling_loc + 1) = _grad_h;
+          *grad_attn_weight = _grad_a;
+        }
+        __syncthreads();
+
+        data_weight_ptr += 1;
+        data_loc_w_ptr += 2;
+        grad_attn_weight += grad_weight_stride;
+        grad_sampling_loc += grad_loc_stride;
+      }
+    }
+  }
+}
+
+
+template <typename scalar_t, unsigned int blockSize>
+__global__ void ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2(const int n,
+                                                const scalar_t *grad_col,
+                                                const scalar_t *data_value,
+                                                const int64_t *data_spatial_shapes,
+                                                const int64_t *data_level_start_index, 
+                                                const scalar_t *data_sampling_loc,
+                                                const scalar_t *data_attn_weight,
+                                                const int batch_size, 
+                                                const int spatial_size, 
+                                                const int num_heads,
+                                                const int channels, 
+                                                const int num_levels,
+                                                const int num_query,
+                                                const int num_point,
+                                                scalar_t *grad_value,
+                                                scalar_t *grad_sampling_loc,
+                                                scalar_t *grad_attn_weight)
+{
+  CUDA_KERNEL_LOOP(index, n)
+  {
+    __shared__ scalar_t cache_grad_sampling_loc[blockSize * 2];
+    __shared__ scalar_t cache_grad_attn_weight[blockSize];
+    unsigned int tid = threadIdx.x;
+    int _temp = index;
+    const int c_col = _temp % channels;
+    _temp /= channels;
+    const int sampling_index = _temp; 
+    const int m_col = _temp % num_heads;
+    _temp /= num_heads;
+    const int q_col = _temp % num_query;
+    _temp /= num_query;
+    const int b_col = _temp;
+
+    const scalar_t top_grad = grad_col[index];
+
+    int data_weight_ptr = sampling_index * num_levels * num_point;
+    int data_loc_w_ptr = data_weight_ptr << 1;
+    const int grad_sampling_ptr = data_weight_ptr;
+    grad_sampling_loc += grad_sampling_ptr << 1;
+    grad_attn_weight += grad_sampling_ptr;
+    const int grad_weight_stride = 1;
+    const int grad_loc_stride = 2;
+    const int qid_stride = num_heads * channels;
+    const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride;
+
+    for (int l_col=0; l_col < num_levels; ++l_col)
+    {
+      const int level_start_id = data_level_start_index[l_col];
+      const int spatial_h_ptr = l_col << 1;
+      const int spatial_h = data_spatial_shapes[spatial_h_ptr];
+      const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1];
+      const int value_ptr_offset = data_value_ptr_init_offset + level_start_id * qid_stride;
+      const scalar_t *data_value_ptr = data_value + value_ptr_offset;
+      scalar_t *grad_value_ptr = grad_value + value_ptr_offset;
+
+      for (int p_col=0; p_col < num_point; ++p_col)
+      {
+        const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr];
+        const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1];
+        const scalar_t weight = data_attn_weight[data_weight_ptr];
+
+        const scalar_t h_im = loc_h * spatial_h - 0.5;
+        const scalar_t w_im = loc_w * spatial_w - 0.5;
+        *(cache_grad_sampling_loc+(threadIdx.x << 1)) = 0;
+        *(cache_grad_sampling_loc+((threadIdx.x << 1) + 1)) = 0;
+        *(cache_grad_attn_weight+threadIdx.x)=0;
+        if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w)
+        {
+          ms_deform_attn_col2im_bilinear(
+            data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, w_im, m_col, c_col,
+            top_grad, weight, grad_value_ptr, 
+            cache_grad_sampling_loc+(threadIdx.x << 1), cache_grad_attn_weight+threadIdx.x);
+        }
+        
+        __syncthreads();
+
+        for (unsigned int s=blockSize/2; s>0; s>>=1)
+        {
+          if (tid < s) {
+            const unsigned int xid1 = tid << 1;
+            const unsigned int xid2 = (tid + s) << 1;
+            cache_grad_attn_weight[tid] += cache_grad_attn_weight[tid + s];
+            cache_grad_sampling_loc[xid1] += cache_grad_sampling_loc[xid2];
+            cache_grad_sampling_loc[xid1 + 1] += cache_grad_sampling_loc[xid2 + 1];
+          }
+          __syncthreads();
+        }
+
+        if (tid == 0)
+        { 
+          *grad_sampling_loc = cache_grad_sampling_loc[0];
+          *(grad_sampling_loc + 1) = cache_grad_sampling_loc[1];
+          *grad_attn_weight = cache_grad_attn_weight[0];
+        }
+        __syncthreads();
+
+        data_weight_ptr += 1;
+        data_loc_w_ptr += 2;
+        grad_attn_weight += grad_weight_stride;
+        grad_sampling_loc += grad_loc_stride;
+      }
+    }
+  }
+}
+
+
+template <typename scalar_t>
+__global__ void ms_deformable_col2im_gpu_kernel_shm_reduce_v1(const int n,
+                                                const scalar_t *grad_col,
+                                                const scalar_t *data_value,
+                                                const int64_t *data_spatial_shapes,
+                                                const int64_t *data_level_start_index, 
+                                                const scalar_t *data_sampling_loc,
+                                                const scalar_t *data_attn_weight,
+                                                const int batch_size, 
+                                                const int spatial_size, 
+                                                const int num_heads,
+                                                const int channels, 
+                                                const int num_levels,
+                                                const int num_query,
+                                                const int num_point,
+                                                scalar_t *grad_value,
+                                                scalar_t *grad_sampling_loc,
+                                                scalar_t *grad_attn_weight)
+{
+  CUDA_KERNEL_LOOP(index, n)
+  {
+    extern __shared__ int _s[];
+    scalar_t* cache_grad_sampling_loc = (scalar_t*)_s;
+    scalar_t* cache_grad_attn_weight = cache_grad_sampling_loc + 2 * blockDim.x;
+    unsigned int tid = threadIdx.x;
+    int _temp = index;
+    const int c_col = _temp % channels;
+    _temp /= channels;
+    const int sampling_index = _temp; 
+    const int m_col = _temp % num_heads;
+    _temp /= num_heads;
+    const int q_col = _temp % num_query;
+    _temp /= num_query;
+    const int b_col = _temp;
+
+    const scalar_t top_grad = grad_col[index];
+
+    int data_weight_ptr = sampling_index * num_levels * num_point;
+    int data_loc_w_ptr = data_weight_ptr << 1;
+    const int grad_sampling_ptr = data_weight_ptr;
+    grad_sampling_loc += grad_sampling_ptr << 1;
+    grad_attn_weight += grad_sampling_ptr;
+    const int grad_weight_stride = 1;
+    const int grad_loc_stride = 2;
+    const int qid_stride = num_heads * channels;
+    const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride;
+
+    for (int l_col=0; l_col < num_levels; ++l_col)
+    {
+      const int level_start_id = data_level_start_index[l_col];
+      const int spatial_h_ptr = l_col << 1;
+      const int spatial_h = data_spatial_shapes[spatial_h_ptr];
+      const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1];
+      const int value_ptr_offset = data_value_ptr_init_offset + level_start_id * qid_stride;
+      const scalar_t *data_value_ptr = data_value + value_ptr_offset;
+      scalar_t *grad_value_ptr = grad_value + value_ptr_offset;
+
+      for (int p_col=0; p_col < num_point; ++p_col)
+      {
+        const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr];
+        const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1];
+        const scalar_t weight = data_attn_weight[data_weight_ptr];
+
+        const scalar_t h_im = loc_h * spatial_h - 0.5;
+        const scalar_t w_im = loc_w * spatial_w - 0.5;
+        *(cache_grad_sampling_loc+(threadIdx.x << 1)) = 0;
+        *(cache_grad_sampling_loc+((threadIdx.x << 1) + 1)) = 0;
+        *(cache_grad_attn_weight+threadIdx.x)=0;
+        if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w)
+        {
+          ms_deform_attn_col2im_bilinear(
+            data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, w_im, m_col, c_col,
+            top_grad, weight, grad_value_ptr, 
+            cache_grad_sampling_loc+(threadIdx.x << 1), cache_grad_attn_weight+threadIdx.x);
+        }
+        
+        __syncthreads();
+        if (tid == 0)
+        {
+          scalar_t _grad_w=cache_grad_sampling_loc[0], _grad_h=cache_grad_sampling_loc[1], _grad_a=cache_grad_attn_weight[0];
+          int sid=2;
+          for (unsigned int tid = 1; tid < blockDim.x; ++tid)
+          {
+            _grad_w += cache_grad_sampling_loc[sid];
+            _grad_h += cache_grad_sampling_loc[sid + 1];
+            _grad_a += cache_grad_attn_weight[tid];
+            sid += 2;
+          }
+          
+          
+          *grad_sampling_loc = _grad_w;
+          *(grad_sampling_loc + 1) = _grad_h;
+          *grad_attn_weight = _grad_a;
+        }
+        __syncthreads();
+
+        data_weight_ptr += 1;
+        data_loc_w_ptr += 2;
+        grad_attn_weight += grad_weight_stride;
+        grad_sampling_loc += grad_loc_stride;
+      }
+    }
+  }
+}
+
+template <typename scalar_t>
+__global__ void ms_deformable_col2im_gpu_kernel_shm_reduce_v2(const int n,
+                                                const scalar_t *grad_col,
+                                                const scalar_t *data_value,
+                                                const int64_t *data_spatial_shapes,
+                                                const int64_t *data_level_start_index, 
+                                                const scalar_t *data_sampling_loc,
+                                                const scalar_t *data_attn_weight,
+                                                const int batch_size, 
+                                                const int spatial_size, 
+                                                const int num_heads,
+                                                const int channels, 
+                                                const int num_levels,
+                                                const int num_query,
+                                                const int num_point,
+                                                scalar_t *grad_value,
+                                                scalar_t *grad_sampling_loc,
+                                                scalar_t *grad_attn_weight)
+{
+  CUDA_KERNEL_LOOP(index, n)
+  {
+    extern __shared__ int _s[];
+    scalar_t* cache_grad_sampling_loc = (scalar_t*)_s;
+    scalar_t* cache_grad_attn_weight = cache_grad_sampling_loc + 2 * blockDim.x;
+    unsigned int tid = threadIdx.x;
+    int _temp = index;
+    const int c_col = _temp % channels;
+    _temp /= channels;
+    const int sampling_index = _temp; 
+    const int m_col = _temp % num_heads;
+    _temp /= num_heads;
+    const int q_col = _temp % num_query;
+    _temp /= num_query;
+    const int b_col = _temp;
+
+    const scalar_t top_grad = grad_col[index];
+
+    int data_weight_ptr = sampling_index * num_levels * num_point;
+    int data_loc_w_ptr = data_weight_ptr << 1;
+    const int grad_sampling_ptr = data_weight_ptr;
+    grad_sampling_loc += grad_sampling_ptr << 1;
+    grad_attn_weight += grad_sampling_ptr;
+    const int grad_weight_stride = 1;
+    const int grad_loc_stride = 2;
+    const int qid_stride = num_heads * channels;
+    const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride;
+
+    for (int l_col=0; l_col < num_levels; ++l_col)
+    {
+      const int level_start_id = data_level_start_index[l_col];
+      const int spatial_h_ptr = l_col << 1;
+      const int spatial_h = data_spatial_shapes[spatial_h_ptr];
+      const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1];
+      const int value_ptr_offset = data_value_ptr_init_offset + level_start_id * qid_stride;
+      const scalar_t *data_value_ptr = data_value + value_ptr_offset;
+      scalar_t *grad_value_ptr = grad_value + value_ptr_offset;
+
+      for (int p_col=0; p_col < num_point; ++p_col)
+      {
+        const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr];
+        const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1];
+        const scalar_t weight = data_attn_weight[data_weight_ptr];
+
+        const scalar_t h_im = loc_h * spatial_h - 0.5;
+        const scalar_t w_im = loc_w * spatial_w - 0.5;
+        *(cache_grad_sampling_loc+(threadIdx.x << 1)) = 0;
+        *(cache_grad_sampling_loc+((threadIdx.x << 1) + 1)) = 0;
+        *(cache_grad_attn_weight+threadIdx.x)=0;
+        if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w)
+        {
+          ms_deform_attn_col2im_bilinear(
+            data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, w_im, m_col, c_col,
+            top_grad, weight, grad_value_ptr, 
+            cache_grad_sampling_loc+(threadIdx.x << 1), cache_grad_attn_weight+threadIdx.x);
+        }
+        
+        __syncthreads();
+
+        for (unsigned int s=blockDim.x/2, spre=blockDim.x; s>0; s>>=1, spre>>=1)
+        {
+          if (tid < s) {
+            const unsigned int xid1 = tid << 1;
+            const unsigned int xid2 = (tid + s) << 1;
+            cache_grad_attn_weight[tid] += cache_grad_attn_weight[tid + s];
+            cache_grad_sampling_loc[xid1] += cache_grad_sampling_loc[xid2];
+            cache_grad_sampling_loc[xid1 + 1] += cache_grad_sampling_loc[xid2 + 1];
+            if (tid + (s << 1) < spre)
+            {
+              cache_grad_attn_weight[tid] += cache_grad_attn_weight[tid + (s << 1)];
+              cache_grad_sampling_loc[xid1] += cache_grad_sampling_loc[xid2 + (s << 1)];
+              cache_grad_sampling_loc[xid1 + 1] += cache_grad_sampling_loc[xid2 + 1 + (s << 1)];
+            } 
+          }
+          __syncthreads();
+        }
+
+        if (tid == 0)
+        {
+          *grad_sampling_loc = cache_grad_sampling_loc[0];
+          *(grad_sampling_loc + 1) = cache_grad_sampling_loc[1];
+          *grad_attn_weight = cache_grad_attn_weight[0];
+        }
+        __syncthreads();
+
+        data_weight_ptr += 1;
+        data_loc_w_ptr += 2;
+        grad_attn_weight += grad_weight_stride;
+        grad_sampling_loc += grad_loc_stride;
+      }
+    }
+  }
+}
+
+template <typename scalar_t>
+__global__ void ms_deformable_col2im_gpu_kernel_shm_reduce_v2_multi_blocks(const int n,
+                                                const scalar_t *grad_col,
+                                                const scalar_t *data_value,
+                                                const int64_t *data_spatial_shapes,
+                                                const int64_t *data_level_start_index, 
+                                                const scalar_t *data_sampling_loc,
+                                                const scalar_t *data_attn_weight,
+                                                const int batch_size, 
+                                                const int spatial_size, 
+                                                const int num_heads,
+                                                const int channels, 
+                                                const int num_levels,
+                                                const int num_query,
+                                                const int num_point,
+                                                scalar_t *grad_value,
+                                                scalar_t *grad_sampling_loc,
+                                                scalar_t *grad_attn_weight)
+{
+  CUDA_KERNEL_LOOP(index, n)
+  {
+    extern __shared__ int _s[];
+    scalar_t* cache_grad_sampling_loc = (scalar_t*)_s;
+    scalar_t* cache_grad_attn_weight = cache_grad_sampling_loc + 2 * blockDim.x;
+    unsigned int tid = threadIdx.x;
+    int _temp = index;
+    const int c_col = _temp % channels;
+    _temp /= channels;
+    const int sampling_index = _temp; 
+    const int m_col = _temp % num_heads;
+    _temp /= num_heads;
+    const int q_col = _temp % num_query;
+    _temp /= num_query;
+    const int b_col = _temp;
+
+    const scalar_t top_grad = grad_col[index];
+
+    int data_weight_ptr = sampling_index * num_levels * num_point;
+    int data_loc_w_ptr = data_weight_ptr << 1;
+    const int grad_sampling_ptr = data_weight_ptr;
+    grad_sampling_loc += grad_sampling_ptr << 1;
+    grad_attn_weight += grad_sampling_ptr;
+    const int grad_weight_stride = 1;
+    const int grad_loc_stride = 2;
+    const int qid_stride = num_heads * channels;
+    const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride;
+
+    for (int l_col=0; l_col < num_levels; ++l_col)
+    {
+      const int level_start_id = data_level_start_index[l_col];
+      const int spatial_h_ptr = l_col << 1;
+      const int spatial_h = data_spatial_shapes[spatial_h_ptr];
+      const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1];
+      const int value_ptr_offset = data_value_ptr_init_offset + level_start_id * qid_stride;
+      const scalar_t *data_value_ptr = data_value + value_ptr_offset;
+      scalar_t *grad_value_ptr = grad_value + value_ptr_offset;
+
+      for (int p_col=0; p_col < num_point; ++p_col)
+      {
+        const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr];
+        const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1];
+        const scalar_t weight = data_attn_weight[data_weight_ptr];
+
+        const scalar_t h_im = loc_h * spatial_h - 0.5;
+        const scalar_t w_im = loc_w * spatial_w - 0.5;
+        *(cache_grad_sampling_loc+(threadIdx.x << 1)) = 0;
+        *(cache_grad_sampling_loc+((threadIdx.x << 1) + 1)) = 0;
+        *(cache_grad_attn_weight+threadIdx.x)=0;
+        if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w)
+        {
+          ms_deform_attn_col2im_bilinear(
+            data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, w_im, m_col, c_col,
+            top_grad, weight, grad_value_ptr, 
+            cache_grad_sampling_loc+(threadIdx.x << 1), cache_grad_attn_weight+threadIdx.x);
+        }
+        
+        __syncthreads();
+
+        for (unsigned int s=blockDim.x/2, spre=blockDim.x; s>0; s>>=1, spre>>=1)
+        {
+          if (tid < s) {
+            const unsigned int xid1 = tid << 1;
+            const unsigned int xid2 = (tid + s) << 1;
+            cache_grad_attn_weight[tid] += cache_grad_attn_weight[tid + s];
+            cache_grad_sampling_loc[xid1] += cache_grad_sampling_loc[xid2];
+            cache_grad_sampling_loc[xid1 + 1] += cache_grad_sampling_loc[xid2 + 1];
+            if (tid + (s << 1) < spre)
+            {
+              cache_grad_attn_weight[tid] += cache_grad_attn_weight[tid + (s << 1)];
+              cache_grad_sampling_loc[xid1] += cache_grad_sampling_loc[xid2 + (s << 1)];
+              cache_grad_sampling_loc[xid1 + 1] += cache_grad_sampling_loc[xid2 + 1 + (s << 1)];
+            }
+          }
+          __syncthreads();
+        }
+
+        if (tid == 0)
+        {
+          atomicAdd(grad_sampling_loc, cache_grad_sampling_loc[0]);
+          atomicAdd(grad_sampling_loc + 1, cache_grad_sampling_loc[1]);
+          atomicAdd(grad_attn_weight, cache_grad_attn_weight[0]);
+        }
+        __syncthreads();
+
+        data_weight_ptr += 1;
+        data_loc_w_ptr += 2;
+        grad_attn_weight += grad_weight_stride;
+        grad_sampling_loc += grad_loc_stride;
+      }
+    }
+  }
+}
+
+
+template <typename scalar_t>
+__global__ void ms_deformable_col2im_gpu_kernel_gm(const int n,
+                                                const scalar_t *grad_col,
+                                                const scalar_t *data_value,
+                                                const int64_t *data_spatial_shapes,
+                                                const int64_t *data_level_start_index, 
+                                                const scalar_t *data_sampling_loc,
+                                                const scalar_t *data_attn_weight,
+                                                const int batch_size, 
+                                                const int spatial_size, 
+                                                const int num_heads,
+                                                const int channels, 
+                                                const int num_levels,
+                                                const int num_query,
+                                                const int num_point,
+                                                scalar_t *grad_value,
+                                                scalar_t *grad_sampling_loc,
+                                                scalar_t *grad_attn_weight)
+{
+  CUDA_KERNEL_LOOP(index, n)
+  {
+    int _temp = index;
+    const int c_col = _temp % channels;
+    _temp /= channels;
+    const int sampling_index = _temp; 
+    const int m_col = _temp % num_heads;
+    _temp /= num_heads;
+    const int q_col = _temp % num_query;
+    _temp /= num_query;
+    const int b_col = _temp;
+
+    const scalar_t top_grad = grad_col[index];
+
+    int data_weight_ptr = sampling_index * num_levels * num_point;
+    int data_loc_w_ptr = data_weight_ptr << 1;
+    const int grad_sampling_ptr = data_weight_ptr;
+    grad_sampling_loc += grad_sampling_ptr << 1;
+    grad_attn_weight += grad_sampling_ptr;
+    const int grad_weight_stride = 1;
+    const int grad_loc_stride = 2;
+    const int qid_stride = num_heads * channels;
+    const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride;
+
+    for (int l_col=0; l_col < num_levels; ++l_col)
+    {
+      const int level_start_id = data_level_start_index[l_col];
+      const int spatial_h_ptr = l_col << 1;
+      const int spatial_h = data_spatial_shapes[spatial_h_ptr];
+      const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1];
+      const int value_ptr_offset = data_value_ptr_init_offset + level_start_id * qid_stride;
+      const scalar_t *data_value_ptr = data_value + value_ptr_offset;
+      scalar_t *grad_value_ptr = grad_value + value_ptr_offset;
+
+      for (int p_col=0; p_col < num_point; ++p_col)
+      {
+        const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr];
+        const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1];
+        const scalar_t weight = data_attn_weight[data_weight_ptr];
+
+        const scalar_t h_im = loc_h * spatial_h - 0.5;
+        const scalar_t w_im = loc_w * spatial_w - 0.5;
+        if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w)
+        {
+          ms_deform_attn_col2im_bilinear_gm(
+            data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, w_im, m_col, c_col,
+            top_grad, weight, grad_value_ptr, 
+            grad_sampling_loc, grad_attn_weight);
+        }
+        data_weight_ptr += 1;
+        data_loc_w_ptr += 2;
+        grad_attn_weight += grad_weight_stride;
+        grad_sampling_loc += grad_loc_stride;
+      }
+    }
+  }
+}
+
+
+template <typename scalar_t>
+void ms_deformable_im2col_cuda(cudaStream_t stream,
+                              const scalar_t* data_value,
+                              const int64_t* data_spatial_shapes, 
+                              const int64_t* data_level_start_index, 
+                              const scalar_t* data_sampling_loc,
+                              const scalar_t* data_attn_weight,
+                              const int batch_size,
+                              const int spatial_size, 
+                              const int num_heads, 
+                              const int channels, 
+                              const int num_levels, 
+                              const int num_query,
+                              const int num_point,
+                              scalar_t* data_col)
+{
+  const int num_kernels = batch_size * num_query * num_heads * channels;
+  const int num_actual_kernels = batch_size * num_query * num_heads * channels;
+  const int num_threads = CUDA_NUM_THREADS;
+  ms_deformable_im2col_gpu_kernel<scalar_t>
+      <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads,
+          0, stream>>>(
+      num_kernels, data_value, data_spatial_shapes, data_level_start_index, data_sampling_loc, data_attn_weight, 
+      batch_size, spatial_size, num_heads, channels, num_levels, num_query, num_point, data_col);
+  
+  cudaError_t err = cudaGetLastError();
+  if (err != cudaSuccess)
+  {
+    printf("error in ms_deformable_im2col_cuda: %s\n", cudaGetErrorString(err));
+  }
+
+}
+
+template <typename scalar_t>
+void ms_deformable_col2im_cuda(cudaStream_t stream,
+                              const scalar_t* grad_col,
+                              const scalar_t* data_value,
+                              const int64_t * data_spatial_shapes,
+                              const int64_t * data_level_start_index,
+                              const scalar_t * data_sampling_loc,
+                              const scalar_t * data_attn_weight,
+                              const int batch_size, 
+                              const int spatial_size, 
+                              const int num_heads,
+                              const int channels, 
+                              const int num_levels,
+                              const int num_query,
+                              const int num_point, 
+                              scalar_t* grad_value,
+                              scalar_t* grad_sampling_loc,
+                              scalar_t* grad_attn_weight)
+{
+  const int num_threads = (channels > CUDA_NUM_THREADS)?CUDA_NUM_THREADS:channels;
+  const int num_kernels = batch_size * num_query * num_heads * channels;
+  const int num_actual_kernels = batch_size * num_query * num_heads * channels;
+  if (channels > 1024)
+  {
+    if ((channels & 1023) == 0)
+    {
+      ms_deformable_col2im_gpu_kernel_shm_reduce_v2_multi_blocks<scalar_t>
+          <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads,
+              num_threads*3*sizeof(scalar_t), stream>>>(
+                        num_kernels, 
+                        grad_col,
+                        data_value,
+                        data_spatial_shapes,
+                        data_level_start_index, 
+                        data_sampling_loc,
+                        data_attn_weight,
+                        batch_size, 
+                        spatial_size, 
+                        num_heads,
+                        channels, 
+                        num_levels,
+                        num_query,
+                        num_point,
+                        grad_value,
+                        grad_sampling_loc,
+                        grad_attn_weight);
+    }
+    else
+    {
+      ms_deformable_col2im_gpu_kernel_gm<scalar_t>
+        <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads,
+            0, stream>>>(
+                      num_kernels, 
+                      grad_col,
+                      data_value,
+                      data_spatial_shapes,
+                      data_level_start_index, 
+                      data_sampling_loc,
+                      data_attn_weight,
+                      batch_size, 
+                      spatial_size, 
+                      num_heads,
+                      channels, 
+                      num_levels,
+                      num_query,
+                      num_point,
+                      grad_value,
+                      grad_sampling_loc,
+                      grad_attn_weight);
+    }
+  }
+  else{
+    switch(channels)
+    {
+      case 1:
+        ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1<scalar_t, 1>
+        <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads,
+            0, stream>>>(
+                      num_kernels, 
+                      grad_col,
+                      data_value,
+                      data_spatial_shapes,
+                      data_level_start_index, 
+                      data_sampling_loc,
+                      data_attn_weight,
+                      batch_size, 
+                      spatial_size, 
+                      num_heads,
+                      channels, 
+                      num_levels,
+                      num_query,
+                      num_point,
+                      grad_value,
+                      grad_sampling_loc,
+                      grad_attn_weight);
+        break;
+      case 2:
+        ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1<scalar_t, 2>
+        <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads,
+            0, stream>>>(
+                      num_kernels, 
+                      grad_col,
+                      data_value,
+                      data_spatial_shapes,
+                      data_level_start_index, 
+                      data_sampling_loc,
+                      data_attn_weight,
+                      batch_size, 
+                      spatial_size, 
+                      num_heads,
+                      channels, 
+                      num_levels,
+                      num_query,
+                      num_point,
+                      grad_value,
+                      grad_sampling_loc,
+                      grad_attn_weight);
+        break;
+      case 4:
+        ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1<scalar_t, 4>
+        <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads,
+            0, stream>>>(
+                      num_kernels, 
+                      grad_col,
+                      data_value,
+                      data_spatial_shapes,
+                      data_level_start_index, 
+                      data_sampling_loc,
+                      data_attn_weight,
+                      batch_size, 
+                      spatial_size, 
+                      num_heads,
+                      channels, 
+                      num_levels,
+                      num_query,
+                      num_point,
+                      grad_value,
+                      grad_sampling_loc,
+                      grad_attn_weight);
+        break;
+      case 8:
+        ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1<scalar_t, 8>
+        <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads,
+            0, stream>>>(
+                      num_kernels, 
+                      grad_col,
+                      data_value,
+                      data_spatial_shapes,
+                      data_level_start_index, 
+                      data_sampling_loc,
+                      data_attn_weight,
+                      batch_size, 
+                      spatial_size, 
+                      num_heads,
+                      channels, 
+                      num_levels,
+                      num_query,
+                      num_point,
+                      grad_value,
+                      grad_sampling_loc,
+                      grad_attn_weight);
+        break;
+      case 16:
+        ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1<scalar_t, 16>
+        <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads,
+            0, stream>>>(
+                      num_kernels, 
+                      grad_col,
+                      data_value,
+                      data_spatial_shapes,
+                      data_level_start_index, 
+                      data_sampling_loc,
+                      data_attn_weight,
+                      batch_size, 
+                      spatial_size, 
+                      num_heads,
+                      channels, 
+                      num_levels,
+                      num_query,
+                      num_point,
+                      grad_value,
+                      grad_sampling_loc,
+                      grad_attn_weight);
+        break;
+      case 32:
+        ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1<scalar_t, 32>
+        <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads,
+            0, stream>>>(
+                      num_kernels, 
+                      grad_col,
+                      data_value,
+                      data_spatial_shapes,
+                      data_level_start_index, 
+                      data_sampling_loc,
+                      data_attn_weight,
+                      batch_size, 
+                      spatial_size, 
+                      num_heads,
+                      channels, 
+                      num_levels,
+                      num_query,
+                      num_point,
+                      grad_value,
+                      grad_sampling_loc,
+                      grad_attn_weight);
+        break;
+      case 64:
+        ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2<scalar_t, 64>
+        <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads,
+            0, stream>>>(
+                      num_kernels, 
+                      grad_col,
+                      data_value,
+                      data_spatial_shapes,
+                      data_level_start_index, 
+                      data_sampling_loc,
+                      data_attn_weight,
+                      batch_size, 
+                      spatial_size, 
+                      num_heads,
+                      channels, 
+                      num_levels,
+                      num_query,
+                      num_point,
+                      grad_value,
+                      grad_sampling_loc,
+                      grad_attn_weight);
+        break;
+      case 128:
+        ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2<scalar_t, 128>
+        <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads,
+            0, stream>>>(
+                      num_kernels, 
+                      grad_col,
+                      data_value,
+                      data_spatial_shapes,
+                      data_level_start_index, 
+                      data_sampling_loc,
+                      data_attn_weight,
+                      batch_size, 
+                      spatial_size, 
+                      num_heads,
+                      channels, 
+                      num_levels,
+                      num_query,
+                      num_point,
+                      grad_value,
+                      grad_sampling_loc,
+                      grad_attn_weight);
+        break;
+      case 256:
+        ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2<scalar_t, 256>
+        <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads,
+            0, stream>>>(
+                      num_kernels, 
+                      grad_col,
+                      data_value,
+                      data_spatial_shapes,
+                      data_level_start_index, 
+                      data_sampling_loc,
+                      data_attn_weight,
+                      batch_size, 
+                      spatial_size, 
+                      num_heads,
+                      channels, 
+                      num_levels,
+                      num_query,
+                      num_point,
+                      grad_value,
+                      grad_sampling_loc,
+                      grad_attn_weight);
+        break;
+      case 512:
+        ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2<scalar_t, 512>
+        <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads,
+            0, stream>>>(
+                      num_kernels, 
+                      grad_col,
+                      data_value,
+                      data_spatial_shapes,
+                      data_level_start_index, 
+                      data_sampling_loc,
+                      data_attn_weight,
+                      batch_size, 
+                      spatial_size, 
+                      num_heads,
+                      channels, 
+                      num_levels,
+                      num_query,
+                      num_point,
+                      grad_value,
+                      grad_sampling_loc,
+                      grad_attn_weight);
+        break;
+      case 1024:
+        ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2<scalar_t, 1024>
+        <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads,
+            0, stream>>>(
+                      num_kernels, 
+                      grad_col,
+                      data_value,
+                      data_spatial_shapes,
+                      data_level_start_index, 
+                      data_sampling_loc,
+                      data_attn_weight,
+                      batch_size, 
+                      spatial_size, 
+                      num_heads,
+                      channels, 
+                      num_levels,
+                      num_query,
+                      num_point,
+                      grad_value,
+                      grad_sampling_loc,
+                      grad_attn_weight);
+        break;
+      default:
+        if (channels < 64)
+        {
+          ms_deformable_col2im_gpu_kernel_shm_reduce_v1<scalar_t>
+          <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads,
+              num_threads*3*sizeof(scalar_t), stream>>>(
+                        num_kernels, 
+                        grad_col,
+                        data_value,
+                        data_spatial_shapes,
+                        data_level_start_index, 
+                        data_sampling_loc,
+                        data_attn_weight,
+                        batch_size, 
+                        spatial_size, 
+                        num_heads,
+                        channels, 
+                        num_levels,
+                        num_query,
+                        num_point,
+                        grad_value,
+                        grad_sampling_loc,
+                        grad_attn_weight);
+        }
+        else
+        {
+          ms_deformable_col2im_gpu_kernel_shm_reduce_v2<scalar_t>
+          <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads,
+              num_threads*3*sizeof(scalar_t), stream>>>(
+                        num_kernels, 
+                        grad_col,
+                        data_value,
+                        data_spatial_shapes,
+                        data_level_start_index, 
+                        data_sampling_loc,
+                        data_attn_weight,
+                        batch_size, 
+                        spatial_size, 
+                        num_heads,
+                        channels, 
+                        num_levels,
+                        num_query,
+                        num_point,
+                        grad_value,
+                        grad_sampling_loc,
+                        grad_attn_weight);
+        }
+    }
+  }
+  cudaError_t err = cudaGetLastError();
+  if (err != cudaSuccess)
+  {
+    printf("error in ms_deformable_col2im_cuda: %s\n", cudaGetErrorString(err));
+  }
+
+}
diff --git a/.venv/lib/python3.11/site-packages/transformers/kernels/deformable_detr/cuda/ms_deform_attn_cuda.h b/.venv/lib/python3.11/site-packages/transformers/kernels/deformable_detr/cuda/ms_deform_attn_cuda.h
new file mode 100644
index 0000000000000000000000000000000000000000..d8c21b4e54dcd7071f5fd27f62410326ac069a94
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/transformers/kernels/deformable_detr/cuda/ms_deform_attn_cuda.h
@@ -0,0 +1,46 @@
+/*!
+**************************************************************************************************
+* Deformable DETR
+* Copyright (c) 2020 SenseTime. All Rights Reserved.
+* Licensed under the Apache License, Version 2.0 [see LICENSE for details]
+**************************************************************************************************
+* Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
+**************************************************************************************************
+*/
+
+#pragma once
+#include <torch/extension.h>
+
+at::Tensor ms_deform_attn_cuda_forward(
+    const at::Tensor &value, 
+    const at::Tensor &spatial_shapes,
+    const at::Tensor &level_start_index,
+    const at::Tensor &sampling_loc,
+    const at::Tensor &attn_weight,
+    const int im2col_step);
+
+at::Tensor ms_deform_attn_cuda_forward_bf16(
+    const at::Tensor &value,
+    const at::Tensor &spatial_shapes,
+    const at::Tensor &level_start_index,
+    const at::Tensor &sampling_loc,
+    const at::Tensor &attn_weight,
+    const int im2col_step);
+
+std::vector<at::Tensor> ms_deform_attn_cuda_backward(
+    const at::Tensor &value, 
+    const at::Tensor &spatial_shapes,
+    const at::Tensor &level_start_index,
+    const at::Tensor &sampling_loc,
+    const at::Tensor &attn_weight,
+    const at::Tensor &grad_output,
+    const int im2col_step);
+
+std::vector<at::Tensor> ms_deform_attn_cuda_backward_bf16(
+    const at::Tensor &value,
+    const at::Tensor &spatial_shapes,
+    const at::Tensor &level_start_index,
+    const at::Tensor &sampling_loc,
+    const at::Tensor &attn_weight,
+    const at::Tensor &grad_output,
+    const int im2col_step);
diff --git a/.venv/lib/python3.11/site-packages/transformers/kernels/deformable_detr/vision.cpp b/.venv/lib/python3.11/site-packages/transformers/kernels/deformable_detr/vision.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..6ce3875568b9ba8d660c90acc805077cca98f891
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/transformers/kernels/deformable_detr/vision.cpp
@@ -0,0 +1,16 @@
+/*!
+**************************************************************************************************
+* Deformable DETR
+* Copyright (c) 2020 SenseTime. All Rights Reserved.
+* Licensed under the Apache License, Version 2.0 [see LICENSE for details]
+**************************************************************************************************
+* Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
+**************************************************************************************************
+*/
+
+#include "ms_deform_attn.h"
+
+PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
+  m.def("ms_deform_attn_forward", &ms_deform_attn_forward, "ms_deform_attn_forward");
+  m.def("ms_deform_attn_backward", &ms_deform_attn_backward, "ms_deform_attn_backward");
+}
\ No newline at end of file
diff --git a/.venv/lib/python3.11/site-packages/transformers/kernels/falcon_mamba/__init__.py b/.venv/lib/python3.11/site-packages/transformers/kernels/falcon_mamba/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..da88e3394f653369a7443245c67dcbe57f2ed23e
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/transformers/kernels/falcon_mamba/__init__.py
@@ -0,0 +1,15 @@
+# coding=utf-8
+# Copyright 2024 Tri Dao, Albert Gu, Technological Innovation Institute and HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from .selective_scan_with_ln_interface import mamba_inner_fn
diff --git a/.venv/lib/python3.11/site-packages/transformers/kernels/falcon_mamba/__pycache__/__init__.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/kernels/falcon_mamba/__pycache__/__init__.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..47840a10bf45cc6a8cf56534d612f3d995ff0653
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/kernels/falcon_mamba/__pycache__/__init__.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/transformers/kernels/falcon_mamba/selective_scan_with_ln_interface.py b/.venv/lib/python3.11/site-packages/transformers/kernels/falcon_mamba/selective_scan_with_ln_interface.py
new file mode 100644
index 0000000000000000000000000000000000000000..4a74986a81a13f9428eab353de5b61a4d101972d
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/transformers/kernels/falcon_mamba/selective_scan_with_ln_interface.py
@@ -0,0 +1,525 @@
+# coding=utf-8
+# Copyright 2024 Tri Dao, Albert Gu, Technological Innovation Institute and HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# Original code from: https://github.com/state-spaces/mamba/blob/main/mamba_ssm/ops/selective_scan_interface.py
+
+import torch
+import torch.nn.functional as F
+from einops import rearrange, repeat
+from torch.cuda.amp import custom_bwd, custom_fwd
+
+
+try:
+    import causal_conv1d_cuda
+except ImportError:
+    causal_conv1d_cuda = None
+
+import mamba_ssm
+import selective_scan_cuda
+
+
+# For BC for old mamba-ssm versions: https://github.com/huggingface/transformers/pull/33195#discussion_r1736401127
+if hasattr(mamba_ssm.ops.triton, "layernorm"):
+    from mamba_ssm.ops.triton.layernorm import _layer_norm_fwd
+else:
+    from mamba_ssm.ops.triton.layer_norm import _layer_norm_fwd
+
+
+class SelectiveScanFn(torch.autograd.Function):
+    @staticmethod
+    def forward(
+        ctx, u, delta, A, B, C, D=None, z=None, delta_bias=None, delta_softplus=False, return_last_state=False
+    ):
+        if u.stride(-1) != 1:
+            u = u.contiguous()
+        if delta.stride(-1) != 1:
+            delta = delta.contiguous()
+        if D is not None:
+            D = D.contiguous()
+        if B.stride(-1) != 1:
+            B = B.contiguous()
+        if C.stride(-1) != 1:
+            C = C.contiguous()
+        if z is not None and z.stride(-1) != 1:
+            z = z.contiguous()
+        if B.dim() == 3:
+            B = rearrange(B, "b dstate l -> b 1 dstate l")
+            ctx.squeeze_B = True
+        if C.dim() == 3:
+            C = rearrange(C, "b dstate l -> b 1 dstate l")
+            ctx.squeeze_C = True
+        out, x, *rest = selective_scan_cuda.fwd(u, delta, A, B, C, D, z, delta_bias, delta_softplus)
+        ctx.delta_softplus = delta_softplus
+        ctx.has_z = z is not None
+        last_state = x[:, :, -1, 1::2]  # (batch, dim, dstate)
+        if not ctx.has_z:
+            ctx.save_for_backward(u, delta, A, B, C, D, delta_bias, x)
+            return out if not return_last_state else (out, last_state)
+        else:
+            ctx.save_for_backward(u, delta, A, B, C, D, z, delta_bias, x, out)
+            out_z = rest[0]
+            return out_z if not return_last_state else (out_z, last_state)
+
+    @staticmethod
+    def backward(ctx, dout, *args):
+        if not ctx.has_z:
+            u, delta, A, B, C, D, delta_bias, x = ctx.saved_tensors
+            z = None
+            out = None
+        else:
+            u, delta, A, B, C, D, z, delta_bias, x, out = ctx.saved_tensors
+        if dout.stride(-1) != 1:
+            dout = dout.contiguous()
+        # The kernel supports passing in a pre-allocated dz (e.g., in case we want to fuse the
+        # backward of selective_scan_cuda with the backward of chunk).
+        # Here we just pass in None and dz will be allocated in the C++ code.
+        du, ddelta, dA, dB, dC, dD, ddelta_bias, *rest = selective_scan_cuda.bwd(
+            u,
+            delta,
+            A,
+            B,
+            C,
+            D,
+            z,
+            delta_bias,
+            dout,
+            x,
+            out,
+            None,
+            ctx.delta_softplus,
+            False,  # option to recompute out_z, not used here
+        )
+        dz = rest[0] if ctx.has_z else None
+        dB = dB.squeeze(1) if getattr(ctx, "squeeze_B", False) else dB
+        dC = dC.squeeze(1) if getattr(ctx, "squeeze_C", False) else dC
+        return (
+            du,
+            ddelta,
+            dA,
+            dB,
+            dC,
+            dD if D is not None else None,
+            dz,
+            ddelta_bias if delta_bias is not None else None,
+            None,
+            None,
+        )
+
+
+def rms_norm_forward(
+    x,
+    weight,
+    bias,
+    eps=1e-6,
+    is_rms_norm=True,
+):
+    # x (b l) d
+    if x.stride(-1) != 1:
+        x = x.contiguous()
+    weight = weight.contiguous()
+    if bias is not None:
+        bias = bias.contiguous()
+    y = _layer_norm_fwd(x, weight, bias, eps, None, residual_dtype=None, is_rms_norm=is_rms_norm)[0]
+    # y (b l) d
+    return y
+
+
+def selective_scan_fn(
+    u, delta, A, B, C, D=None, z=None, delta_bias=None, delta_softplus=False, return_last_state=False
+):
+    """if return_last_state is True, returns (out, last_state)
+    last_state has shape (batch, dim, dstate). Note that the gradient of the last state is
+    not considered in the backward pass.
+    """
+    return SelectiveScanFn.apply(u, delta, A, B, C, D, z, delta_bias, delta_softplus, return_last_state)
+
+
+def selective_scan_ref(
+    u, delta, A, B, C, D=None, z=None, delta_bias=None, delta_softplus=False, return_last_state=False
+):
+    """
+    u: r(B D L)
+    delta: r(B D L)
+    A: c(D N) or r(D N)
+    B: c(D N) or r(B N L) or r(B N 2L) or r(B G N L) or (B G N L)
+    C: c(D N) or r(B N L) or r(B N 2L) or r(B G N L) or (B G N L)
+    D: r(D)
+    z: r(B D L)
+    delta_bias: r(D), fp32
+
+    out: r(B D L)
+    last_state (optional): r(B D dstate) or c(B D dstate)
+    """
+    dtype_in = u.dtype
+    u = u.float()
+    delta = delta.float()
+    if delta_bias is not None:
+        delta = delta + delta_bias[..., None].float()
+    if delta_softplus:
+        delta = F.softplus(delta)
+    batch, dim, dstate = u.shape[0], A.shape[0], A.shape[1]
+    is_variable_B = B.dim() >= 3
+    is_variable_C = C.dim() >= 3
+    if A.is_complex():
+        if is_variable_B:
+            B = torch.view_as_complex(rearrange(B.float(), "... (L two) -> ... L two", two=2))
+        if is_variable_C:
+            C = torch.view_as_complex(rearrange(C.float(), "... (L two) -> ... L two", two=2))
+    else:
+        B = B.float()
+        C = C.float()
+    x = A.new_zeros((batch, dim, dstate))
+    ys = []
+    deltaA = torch.exp(torch.einsum("bdl,dn->bdln", delta, A))
+    if not is_variable_B:
+        deltaB_u = torch.einsum("bdl,dn,bdl->bdln", delta, B, u)
+    else:
+        if B.dim() == 3:
+            deltaB_u = torch.einsum("bdl,bnl,bdl->bdln", delta, B, u)
+        else:
+            B = repeat(B, "B G N L -> B (G H) N L", H=dim // B.shape[1])
+            deltaB_u = torch.einsum("bdl,bdnl,bdl->bdln", delta, B, u)
+    if is_variable_C and C.dim() == 4:
+        C = repeat(C, "B G N L -> B (G H) N L", H=dim // C.shape[1])
+    last_state = None
+    for i in range(u.shape[2]):
+        x = deltaA[:, :, i] * x + deltaB_u[:, :, i]
+        if not is_variable_C:
+            y = torch.einsum("bdn,dn->bd", x, C)
+        else:
+            if C.dim() == 3:
+                y = torch.einsum("bdn,bn->bd", x, C[:, :, i])
+            else:
+                y = torch.einsum("bdn,bdn->bd", x, C[:, :, :, i])
+        if i == u.shape[2] - 1:
+            last_state = x
+        if y.is_complex():
+            y = y.real * 2
+        ys.append(y)
+    y = torch.stack(ys, dim=2)  # (batch dim L)
+    out = y if D is None else y + u * rearrange(D, "d -> d 1")
+    if z is not None:
+        out = out * F.silu(z)
+    out = out.to(dtype=dtype_in)
+    return out if not return_last_state else (out, last_state)
+
+
+class MambaInnerFn(torch.autograd.Function):
+    @staticmethod
+    @custom_fwd
+    def forward(
+        ctx,
+        xz,
+        conv1d_weight,
+        conv1d_bias,
+        x_proj_weight,
+        delta_proj_weight,
+        out_proj_weight,
+        out_proj_bias,
+        A,
+        B=None,
+        C=None,
+        D=None,
+        delta_bias=None,
+        B_proj_bias=None,
+        C_proj_bias=None,
+        delta_softplus=True,
+        checkpoint_lvl=1,
+        b_rms_weight=None,
+        c_rms_weight=None,
+        dt_rms_weight=None,
+        b_c_dt_rms_eps=1e-6,
+    ):
+        """
+        xz: (batch, dim, seqlen)
+        """
+        assert causal_conv1d_cuda is not None, "causal_conv1d_cuda is not available. Please install causal-conv1d."
+        assert checkpoint_lvl in [0, 1]
+        L = xz.shape[-1]
+        delta_rank = delta_proj_weight.shape[1]
+        d_state = A.shape[-1] * (1 if not A.is_complex() else 2)
+        if torch.is_autocast_enabled():
+            x_proj_weight = x_proj_weight.to(dtype=torch.get_autocast_gpu_dtype())
+            delta_proj_weight = delta_proj_weight.to(dtype=torch.get_autocast_gpu_dtype())
+            out_proj_weight = out_proj_weight.to(dtype=torch.get_autocast_gpu_dtype())
+            out_proj_bias = (
+                out_proj_bias.to(dtype=torch.get_autocast_gpu_dtype()) if out_proj_bias is not None else None
+            )
+        if xz.stride(-1) != 1:
+            xz = xz.contiguous()
+        conv1d_weight = rearrange(conv1d_weight, "d 1 w -> d w")
+        x, z = xz.chunk(2, dim=1)
+        conv1d_bias = conv1d_bias.contiguous() if conv1d_bias is not None else None
+        conv1d_out = causal_conv1d_cuda.causal_conv1d_fwd(x, conv1d_weight, conv1d_bias, None, None, None, True)
+        # We're being very careful here about the layout, to avoid extra transposes.
+        # We want delta to have d as the slowest moving dimension
+        # and L as the fastest moving dimension, since those are what the ssm_scan kernel expects.
+        x_dbl = F.linear(rearrange(conv1d_out, "b d l -> (b l) d"), x_proj_weight)  # (bl d)
+        delta = rearrange(delta_proj_weight @ x_dbl[:, :delta_rank].t(), "d (b l) -> b d l", l=L)
+        ctx.is_variable_B = B is None
+        ctx.is_variable_C = C is None
+        ctx.B_proj_bias_is_None = B_proj_bias is None
+        ctx.C_proj_bias_is_None = C_proj_bias is None
+        if B is None:  # variable B
+            B = x_dbl[:, delta_rank : delta_rank + d_state]  # (bl dstate)
+            if B_proj_bias is not None:
+                B = B + B_proj_bias.to(dtype=B.dtype)
+            if not A.is_complex():
+                # B = rearrange(B, "(b l) dstate -> b dstate l", l=L).contiguous()
+                B = rearrange(B, "(b l) dstate -> b 1 dstate l", l=L).contiguous()
+            else:
+                B = rearrange(B, "(b l) (dstate two) -> b 1 dstate (l two)", l=L, two=2).contiguous()
+        else:
+            if B.stride(-1) != 1:
+                B = B.contiguous()
+        if C is None:  # variable C
+            C = x_dbl[:, -d_state:]  # (bl dstate)
+            if C_proj_bias is not None:
+                C = C + C_proj_bias.to(dtype=C.dtype)
+            if not A.is_complex():
+                # C = rearrange(C, "(b l) dstate -> b dstate l", l=L).contiguous()
+                C = rearrange(C, "(b l) dstate -> b 1 dstate l", l=L).contiguous()
+            else:
+                C = rearrange(C, "(b l) (dstate two) -> b 1 dstate (l two)", l=L, two=2).contiguous()
+        else:
+            if C.stride(-1) != 1:
+                C = C.contiguous()
+        if D is not None:
+            D = D.contiguous()
+
+        if b_rms_weight is not None:
+            B = rearrange(B, "b 1 dstate l -> (b l) dstate", l=L).contiguous()
+            B = rms_norm_forward(B, b_rms_weight, bias=None, eps=b_c_dt_rms_eps)
+            B = rearrange(B, "(b l) dstate -> b 1 dstate l", l=L).contiguous()
+        if c_rms_weight is not None:
+            C = rearrange(C, "b 1 dstate l -> (b l) dstate", l=L).contiguous()
+            C = rms_norm_forward(C, c_rms_weight, bias=None, eps=b_c_dt_rms_eps)
+            C = rearrange(C, "(b l) dstate -> b 1 dstate l", l=L).contiguous()
+        if dt_rms_weight is not None:
+            delta = rearrange(delta, "b d l -> (b l) d", l=L).contiguous()
+            delta = rms_norm_forward(delta, dt_rms_weight, bias=None, eps=b_c_dt_rms_eps)
+            delta = rearrange(delta, "(b l) d -> b d l", l=L).contiguous()
+
+        out, scan_intermediates, out_z = selective_scan_cuda.fwd(
+            conv1d_out, delta, A, B, C, D, z, delta_bias, delta_softplus
+        )
+        ctx.delta_softplus = delta_softplus
+        ctx.out_proj_bias_is_None = out_proj_bias is None
+        ctx.checkpoint_lvl = checkpoint_lvl
+        ctx.b_rms_weight = b_rms_weight
+        ctx.c_rms_weight = c_rms_weight
+        ctx.dt_rms_weight = dt_rms_weight
+        ctx.b_c_dt_rms_eps = b_c_dt_rms_eps
+        if checkpoint_lvl >= 1:  # Will recompute conv1d_out and delta in the backward pass
+            conv1d_out, delta = None, None
+        ctx.save_for_backward(
+            xz,
+            conv1d_weight,
+            conv1d_bias,
+            x_dbl,
+            x_proj_weight,
+            delta_proj_weight,
+            out_proj_weight,
+            conv1d_out,
+            delta,
+            A,
+            B,
+            C,
+            D,
+            delta_bias,
+            scan_intermediates,
+            b_rms_weight,
+            c_rms_weight,
+            dt_rms_weight,
+            out,
+        )
+        return F.linear(rearrange(out_z, "b d l -> b l d"), out_proj_weight, out_proj_bias)
+
+    @staticmethod
+    @custom_bwd
+    def backward(ctx, dout):
+        # dout: (batch, seqlen, dim)
+        assert causal_conv1d_cuda is not None, "causal_conv1d_cuda is not available. Please install causal-conv1d."
+        (
+            xz,
+            conv1d_weight,
+            conv1d_bias,
+            x_dbl,
+            x_proj_weight,
+            delta_proj_weight,
+            out_proj_weight,
+            conv1d_out,
+            delta,
+            A,
+            B,
+            C,
+            D,
+            delta_bias,
+            scan_intermediates,
+            b_rms_weight,
+            c_rms_weight,
+            dt_rms_weight,
+            out,
+        ) = ctx.saved_tensors
+        L = xz.shape[-1]
+        delta_rank = delta_proj_weight.shape[1]
+        d_state = A.shape[-1] * (1 if not A.is_complex() else 2)
+        x, z = xz.chunk(2, dim=1)
+        if dout.stride(-1) != 1:
+            dout = dout.contiguous()
+        if ctx.checkpoint_lvl == 1:
+            conv1d_out = causal_conv1d_cuda.causal_conv1d_fwd(x, conv1d_weight, conv1d_bias, None, None, None, True)
+            delta = rearrange(delta_proj_weight @ x_dbl[:, :delta_rank].t(), "d (b l) -> b d l", l=L)
+            if dt_rms_weight is not None:
+                delta = rearrange(delta, "b d l -> (b l) d", l=L).contiguous()
+                delta = rms_norm_forward(delta, ctx.dt_rms_weight, None, ctx.b_c_dt_rms_eps)
+                delta = rearrange(delta, "(b l) d -> b d l", l=L).contiguous()
+            if b_rms_weight is not None:
+                # Recompute & RMSNorm B
+                B = rearrange(B, "b 1 dstate l -> (b l) dstate", l=L).contiguous()
+                B = rms_norm_forward(B, ctx.b_rms_weight, None, ctx.b_c_dt_rms_eps)
+                B = rearrange(B, "(b l) dstate -> b 1 dstate l", l=L).contiguous()
+            if c_rms_weight is not None:
+                # Recompute & RMSNorm C
+                C = rearrange(C, "b 1 dstate l -> (b l) dstate", l=L).contiguous()
+                C = rms_norm_forward(C, ctx.c_rms_weight, None, ctx.b_c_dt_rms_eps)
+                C = rearrange(C, "(b l) dstate -> b 1 dstate l", l=L).contiguous()
+
+        # The kernel supports passing in a pre-allocated dz (e.g., in case we want to fuse the
+        # backward of selective_scan_cuda with the backward of chunk).
+        dxz = torch.empty_like(xz)  # (batch, dim, seqlen)
+        dx, dz = dxz.chunk(2, dim=1)
+        dout = rearrange(dout, "b l e -> e (b l)")
+        dout_y = rearrange(out_proj_weight.t() @ dout, "d (b l) -> b d l", l=L)
+        dconv1d_out, ddelta, dA, dB, dC, dD, ddelta_bias, dz, out_z = selective_scan_cuda.bwd(
+            conv1d_out,
+            delta,
+            A,
+            B,
+            C,
+            D,
+            z,
+            delta_bias,
+            dout_y,
+            scan_intermediates,
+            out,
+            dz,
+            ctx.delta_softplus,
+            True,  # option to recompute out_z
+        )
+        dout_proj_weight = torch.einsum("eB,dB->ed", dout, rearrange(out_z, "b d l -> d (b l)"))
+        dout_proj_bias = dout.sum(dim=(0, 1)) if not ctx.out_proj_bias_is_None else None
+        dD = dD if D is not None else None
+        dx_dbl = torch.empty_like(x_dbl)
+        dB_proj_bias = None
+        if ctx.is_variable_B:
+            if not A.is_complex():
+                dB = rearrange(dB, "b 1 dstate l -> (b l) dstate").contiguous()
+            else:
+                dB = rearrange(dB, "b 1 dstate (l two) -> (b l) (dstate two)", two=2).contiguous()
+            dB_proj_bias = dB.sum(0) if not ctx.B_proj_bias_is_None else None
+            dx_dbl[:, delta_rank : delta_rank + d_state] = dB  # (bl d)
+            dB = None
+        dC_proj_bias = None
+        if ctx.is_variable_C:
+            if not A.is_complex():
+                dC = rearrange(dC, "b 1 dstate l -> (b l) dstate").contiguous()
+            else:
+                dC = rearrange(dC, "b 1 dstate (l two) -> (b l) (dstate two)", two=2).contiguous()
+            dC_proj_bias = dC.sum(0) if not ctx.C_proj_bias_is_None else None
+            dx_dbl[:, -d_state:] = dC  # (bl d)
+            dC = None
+        ddelta = rearrange(ddelta, "b d l -> d (b l)")
+        ddelta_proj_weight = torch.einsum("dB,Br->dr", ddelta, x_dbl[:, :delta_rank])
+        dx_dbl[:, :delta_rank] = torch.einsum("dB,dr->Br", ddelta, delta_proj_weight)
+        dconv1d_out = rearrange(dconv1d_out, "b d l -> d (b l)")
+        dx_proj_weight = torch.einsum("Br,Bd->rd", dx_dbl, rearrange(conv1d_out, "b d l -> (b l) d"))
+        dconv1d_out = torch.addmm(dconv1d_out, x_proj_weight.t(), dx_dbl.t(), out=dconv1d_out)
+        dconv1d_out = rearrange(dconv1d_out, "d (b l) -> b d l", b=x.shape[0], l=x.shape[-1])
+        # The kernel supports passing in a pre-allocated dx (e.g., in case we want to fuse the
+        # backward of conv1d with the backward of chunk).
+        dx, dconv1d_weight, dconv1d_bias, *_ = causal_conv1d_cuda.causal_conv1d_bwd(
+            x, conv1d_weight, conv1d_bias, dconv1d_out, None, None, None, dx, False, True
+        )
+        dconv1d_bias = dconv1d_bias if conv1d_bias is not None else None
+        dconv1d_weight = rearrange(dconv1d_weight, "d w -> d 1 w")
+        return (
+            dxz,
+            dconv1d_weight,
+            dconv1d_bias,
+            dx_proj_weight,
+            ddelta_proj_weight,
+            dout_proj_weight,
+            dout_proj_bias,
+            dA,
+            dB,
+            dC,
+            dD,
+            ddelta_bias if delta_bias is not None else None,
+            # 6-None are delta_softplus, checkpoint_lvl, b_rms_weight, c_rms_weight, dt_rms_weight, b_c_dt_rms_eps
+            dB_proj_bias,
+            dC_proj_bias,
+            None,
+            None,
+            None,
+            None,
+            None,
+            None,
+        )
+
+
+def mamba_inner_fn(
+    xz,
+    conv1d_weight,
+    conv1d_bias,
+    x_proj_weight,
+    delta_proj_weight,
+    out_proj_weight,
+    out_proj_bias,
+    A,
+    B=None,
+    C=None,
+    D=None,
+    delta_bias=None,
+    B_proj_bias=None,
+    C_proj_bias=None,
+    delta_softplus=True,
+    checkpoint_lvl=1,
+    b_rms_weight=None,
+    c_rms_weight=None,
+    dt_rms_weight=None,
+    b_c_dt_rms_eps=1e-6,
+):
+    return MambaInnerFn.apply(
+        xz,
+        conv1d_weight,
+        conv1d_bias,
+        x_proj_weight,
+        delta_proj_weight,
+        out_proj_weight,
+        out_proj_bias,
+        A,
+        B,
+        C,
+        D,
+        delta_bias,
+        B_proj_bias,
+        C_proj_bias,
+        delta_softplus,
+        checkpoint_lvl,
+        b_rms_weight,
+        c_rms_weight,
+        dt_rms_weight,
+        b_c_dt_rms_eps,
+    )
diff --git a/.venv/lib/python3.11/site-packages/transformers/kernels/mra/cuda_kernel.cu b/.venv/lib/python3.11/site-packages/transformers/kernels/mra/cuda_kernel.cu
new file mode 100644
index 0000000000000000000000000000000000000000..87ed89052873813153786bd416a981d3e5279af9
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/transformers/kernels/mra/cuda_kernel.cu
@@ -0,0 +1,383 @@
+#include "cuda_kernel.h"
+
+//////////////////////////////////////////////////////////////////////////////////////////////////
+//////////////////////////////////////////////////////////////////////////////////////////////////
+
+__global__ void index_max_cuda_kernel(
+  float *index_vals,       // [batch_size, 32, num_block]
+  int   *indices,        // [batch_size, num_block]
+  float *max_vals,        // [batch_size, A_num_block * 32]
+  float *max_vals_scatter,   // [batch_size, 32, num_block]
+  long batch_size,
+  long A_num_block,
+  long B_num_block,
+  long num_block
+) {
+
+  long batch_idx = blockIdx.x;
+
+  long thread_idx = threadIdx.x;
+  long num_thread = blockDim.x;
+
+  extern __shared__ float buffer[];
+  int *max_buffer = (int*)buffer;
+
+  for (int i = 0; i < A_num_block * 32; i = i + num_thread) {
+    int idx = i + thread_idx;
+    if (idx < A_num_block * 32) {
+      max_buffer[idx] = -1e8;
+    }
+  }
+  __syncthreads();
+
+  int *indices_pt = &indices[batch_idx * num_block];
+  float *index_vals_pt = &index_vals[batch_idx * num_block * 32];
+
+  for (int idx_start = 0; idx_start < 32 * num_block; idx_start = idx_start + num_thread) {
+    int idx = idx_start + thread_idx;
+    int A_block_idx = indices_pt[idx % num_block] / B_num_block;
+    atomicMax(&max_buffer[A_block_idx * 32 + idx / num_block], (int)(index_vals_pt[idx] * 1000));
+  }
+  __syncthreads();
+  
+  float *max_vals_pt = &max_vals[batch_idx * A_num_block * 32];
+  for (int i = 0; i < A_num_block * 32; i = i + num_thread) {
+    int idx = i + thread_idx;
+    if (idx < A_num_block * 32) {
+      max_vals_pt[idx] = (float)max_buffer[idx] / 1000.;
+    }
+  }
+  
+  float *max_vals_scatter_pt = &max_vals_scatter[batch_idx * num_block * 32];
+  for (int idx_start = 0; idx_start < 32 * num_block; idx_start = idx_start + num_thread) {
+    int idx = idx_start + thread_idx;
+    int A_block_idx = indices_pt[idx % num_block] / B_num_block;
+    max_vals_scatter_pt[idx] = (float)max_buffer[A_block_idx * 32 + idx / num_block] / 1000.;
+  }
+
+}
+
+__global__ void mm_to_sparse_cuda_kernel(
+  float *dense_A,   // [batch_size, A_num_block, dim, 32]
+  float *dense_B,   // [batch_size, B_num_block, dim, 32]
+  int   *indices,   // [batch_size, num_block]
+  float *sparse_C,  // [batch_size, num_block, 32, 32]
+  long batch_size,
+  long A_num_block,
+  long B_num_block,
+  long dim,
+  long num_block
+) {
+
+  long batch_idx = blockIdx.y;
+  long block_idx = blockIdx.x * blockDim.y + threadIdx.y;
+
+  long thread_idx = threadIdx.x;
+
+  __shared__ float buffer[4096];
+  float *A_buffer = &buffer[threadIdx.y * 1024]; // [2, 8, 32]
+  float *B_buffer = &buffer[threadIdx.y * 1024 + 512]; // [2, 8, 32]
+
+  long batch_idx__block_idx = batch_idx * num_block + block_idx;
+
+  long AB_block_idx = indices[batch_idx__block_idx];
+  float *dense_A_pt = &dense_A[(batch_idx * A_num_block + AB_block_idx / B_num_block) * dim * 32];
+  float *dense_B_pt = &dense_B[(batch_idx * B_num_block + AB_block_idx % B_num_block) * dim * 32];
+
+  int reg_1_idx = thread_idx / 8;    // [0000000011111111222222223333333344444444555555556666666677777777]
+  int reg_2_idx = thread_idx % 8;    // [0123456701234567012345670123456701234567012345670123456701234567]
+
+  float reg_1[8];
+  float reg_2[8];
+
+  float reg_array[16] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
+
+  #pragma unroll
+  for (int i = 0; i < 4; i++) {
+    A_buffer[i * 64 + thread_idx] = dense_A_pt[i * 64 + thread_idx];
+    B_buffer[i * 64 + thread_idx] = dense_B_pt[i * 64 + thread_idx];
+  }
+
+  __syncthreads();
+
+  #pragma unroll
+  for (int i = 0; i < 4; i++) {
+    reg_1[i] = A_buffer[reg_1_idx * 4 + i];
+    reg_2[i] = B_buffer[reg_2_idx * 4 + i];
+  }
+
+  for (int dim_stride = 1; dim_stride < (dim / 8); dim_stride++) {
+
+    #pragma unroll
+    for (int i = 0; i < 4; i++) {
+      A_buffer[(dim_stride % 2) * 256 + i * 64 + thread_idx] = dense_A_pt[dim_stride * 256 + i * 64 + thread_idx];
+      B_buffer[(dim_stride % 2) * 256 + i * 64 + thread_idx] = dense_B_pt[dim_stride * 256 + i * 64 + thread_idx];
+    }
+
+    #pragma unroll
+    for (int mini_dim_idx = 1; mini_dim_idx < 8; mini_dim_idx++) {
+      #pragma unroll
+      for (int i = 0; i < 4; i++) {
+        reg_1[(mini_dim_idx % 2) * 4 + i] = A_buffer[((dim_stride - 1) % 2) * 256 + mini_dim_idx * 32 + reg_1_idx * 4 + i];
+        reg_2[(mini_dim_idx % 2) * 4 + i] = B_buffer[((dim_stride - 1) % 2) * 256 + mini_dim_idx * 32 + reg_2_idx * 4 + i];
+      }
+      #pragma unroll
+      for (int i = 0; i < 4; i++) {
+        #pragma unroll
+        for (int j = 0; j < 4; j++) {
+          reg_array[i * 4 + j] += reg_1[((mini_dim_idx - 1) % 2) * 4 + i] * reg_2[((mini_dim_idx - 1) % 2) * 4 + j];
+        }
+      }
+    }
+
+    __syncthreads();
+
+    #pragma unroll
+    for (int i = 0; i < 4; i++) {
+      reg_1[i] = A_buffer[(dim_stride % 2) * 256 + reg_1_idx * 4 + i];
+      reg_2[i] = B_buffer[(dim_stride % 2) * 256 + reg_2_idx * 4 + i];
+    }
+
+    #pragma unroll
+    for (int i = 0; i < 4; i++) {
+      #pragma unroll
+      for (int j = 0; j < 4; j++) {
+        reg_array[i * 4 + j] += reg_1[4 + i] * reg_2[4 + j];
+      }
+    }
+
+  }
+
+  #pragma unroll
+  for (int mini_dim_idx = 1; mini_dim_idx < 8; mini_dim_idx++) {
+    #pragma unroll
+    for (int i = 0; i < 4; i++) {
+      reg_1[(mini_dim_idx % 2) * 4 + i] = A_buffer[256 + mini_dim_idx * 32 + reg_1_idx * 4 + i];
+      reg_2[(mini_dim_idx % 2) * 4 + i] = B_buffer[256 + mini_dim_idx * 32 + reg_2_idx * 4 + i];
+    }
+    #pragma unroll
+    for (int i = 0; i < 4; i++) {
+      #pragma unroll
+      for (int j = 0; j < 4; j++) {
+        reg_array[i * 4 + j] += reg_1[((mini_dim_idx - 1) % 2) * 4 + i] * reg_2[((mini_dim_idx - 1) % 2) * 4 + j];
+      }
+    }
+  }
+  #pragma unroll
+  for (int i = 0; i < 4; i++) {
+    #pragma unroll
+    for (int j = 0; j < 4; j++) {
+      reg_array[i * 4 + j] += reg_1[4 + i] * reg_2[4 + j];
+    }
+  }
+  __syncthreads();
+
+  float *C_buffer = &buffer[threadIdx.y * 1024]; // [32, 32]
+
+  #pragma unroll
+  for (int i = 0; i < 4; i++) {
+    #pragma unroll
+    for (int j = 0; j < 4; j++) {
+      C_buffer[(reg_2_idx * 4 + j) * 32 + reg_1_idx * 4 + i] = reg_array[i * 4 + j];
+    }
+  }
+  __syncthreads();
+
+  float *sparse_C_pt = &sparse_C[batch_idx__block_idx * 1024];
+
+  #pragma unroll
+  for (int i = 0; i < 16; i++) {
+    sparse_C_pt[i * 64 + thread_idx] = C_buffer[i * 64 + thread_idx];
+  }
+
+}
+
+__global__ void sparse_dense_mm_cuda_kernel(
+  float *sparse_A,  // [batch_size, num_block, 32, 32]
+  int   *indices,   // [batch_size, num_block]
+  float *dense_B,   // [batch_size, B_num_block, dim, 32]
+  float *dense_C,   // [batch_size, A_num_block, dim, 32]
+  long batch_size,
+  long A_num_block,
+  long B_num_block,
+  long dim,
+  long num_block
+) {
+
+  long batch_idx = blockIdx.y;
+  long block_idx = blockIdx.x * blockDim.y + threadIdx.y;
+
+  long thread_idx = threadIdx.x;
+
+  __shared__ float buffer[6144];
+  float *A_buffer = &buffer[threadIdx.y * 3072]; // [32, 32]
+  float *B_buffer = &buffer[threadIdx.y * 3072 + 1024]; // [32, 64]
+
+  long batch_idx__block_idx = batch_idx * num_block + block_idx;
+
+  float *sparse_A_pt = &sparse_A[batch_idx__block_idx * 1024];
+  #pragma unroll
+  for (int i = 0; i < 8; i++) {
+    A_buffer[i * 128 + thread_idx] = sparse_A_pt[i * 128 + thread_idx];
+  }
+
+  long AB_block_idx = indices[batch_idx__block_idx];
+  float *dense_B_pt = &dense_B[(batch_idx * B_num_block + AB_block_idx % B_num_block) * 32 * dim];
+  float *dense_C_pt = &dense_C[(batch_idx * A_num_block + AB_block_idx / B_num_block) * 32 * dim];
+
+  // [0000000011111111222222223333333344444444555555556666666677777777]
+  // [0123456701234567012345670123456701234567012345670123456701234567]
+  int reg_1_idx = thread_idx / 8;
+  int reg_2_idx = thread_idx % 8;
+
+  float reg_1[8];
+  float reg_2[8];
+
+  float reg_array[16];
+
+  for (int dim_stride = 0; dim_stride < dim; dim_stride = dim_stride + 64) {
+
+    #pragma unroll
+    for (int i = 0; i < 16; i++) {
+      B_buffer[i * 128 + thread_idx] = dense_B_pt[dim_stride * 32 + i * 128 + thread_idx];
+    }
+
+    #pragma unroll
+    for (int i = 0; i < 16; i++) {
+      reg_array[i] = 0;
+    }
+
+    __syncthreads();
+
+    #pragma unroll
+    for (int i = 0; i < 4; i++) {
+      reg_1[i] = B_buffer[(reg_1_idx * 4 + i) * 32];
+      reg_2[i] = A_buffer[reg_2_idx * 4 + i];
+    }
+
+    #pragma unroll
+    for (int mini_dim_idx = 1; mini_dim_idx < 32; mini_dim_idx++) {
+      #pragma unroll
+      for (int i = 0; i < 4; i++) {
+        reg_1[(mini_dim_idx % 2) * 4 + i] = B_buffer[(reg_1_idx * 4 + i) * 32 + mini_dim_idx];
+        reg_2[(mini_dim_idx % 2) * 4 + i] = A_buffer[mini_dim_idx * 32 + reg_2_idx * 4 + i];
+      }
+      #pragma unroll
+      for (int i = 0; i < 4; i++) {
+        #pragma unroll
+        for (int j = 0; j < 4; j++) {
+          reg_array[i * 4 + j] += reg_1[((mini_dim_idx - 1) % 2) * 4 + i] * reg_2[((mini_dim_idx - 1) % 2) * 4 + j];
+        }
+      }
+    }
+
+    #pragma unroll
+    for (int i = 0; i < 4; i++) {
+      #pragma unroll
+      for (int j = 0; j < 4; j++) {
+        reg_array[i * 4 + j] += reg_1[4 + i] * reg_2[4 + j];
+      }
+    }
+
+    __syncthreads();
+
+    float *C_buffer = &buffer[threadIdx.y * 3072 + 1024]; // [64, 32]
+
+    #pragma unroll
+    for (int i = 0; i < 4; i++) {
+      #pragma unroll
+      for (int j = 0; j < 4; j++) {
+        C_buffer[(reg_1_idx * 4 + i) * 32 + reg_2_idx * 4 + j] = reg_array[i * 4 + j];
+      }
+    }
+    __syncthreads();
+
+    #pragma unroll
+    for (int i = 0; i < 16; i++) {
+      atomicAdd(&dense_C_pt[dim_stride * 32 + i * 128 + thread_idx], C_buffer[i * 128 + thread_idx]);
+    }
+    __syncthreads();
+
+  }
+
+}
+
+
+__global__ void reduce_sum_cuda_kernel(
+  float *sparse_A,  // [batch_size, num_block, 32, 32]
+  int   *indices,   // [batch_size, num_block]
+  float *dense_C,   // [batch_size, A_num_block, 32]
+  long batch_size,
+  long A_num_block,
+  long B_num_block,
+  long num_block
+) {
+
+  long batch_idx = blockIdx.y;
+  long block_idx = blockIdx.x * blockDim.y + threadIdx.y;
+
+  long thread_idx = threadIdx.x;
+
+  long batch_idx__block_idx = batch_idx * num_block + block_idx;
+
+  long AB_block_idx = indices[batch_idx__block_idx];
+  float *sparse_A_pt = &sparse_A[batch_idx__block_idx * 1024];
+
+  float reg_array[16];
+  float value = 0;
+
+  #pragma unroll
+  for (int i = 0; i < 8; i++) {
+    reg_array[i] = sparse_A_pt[i * 32 + thread_idx];
+  }
+  #pragma unroll
+  for (int stride = 8; stride < 32; stride = stride + 8) {
+    #pragma unroll
+    for (int i = 0; i < 8; i++) {
+      reg_array[(stride + i) % 16] = sparse_A_pt[(stride + i) * 32 + thread_idx];
+    }
+    #pragma unroll
+    for (int i = 0; i < 8; i++) {
+      value = value + reg_array[(stride - 8 + i) % 16];
+    }
+  }
+  #pragma unroll
+  for (int i = 0; i < 8; i++) {
+    value = value + reg_array[8 + i];
+  }
+
+  float *dense_C_pt = &dense_C[(batch_idx * A_num_block + AB_block_idx / B_num_block) * 32];
+
+  atomicAdd(&dense_C_pt[thread_idx], value);
+
+}
+
+__global__ void scatter_cuda_kernel(
+  float *dense_A,   // [batch_size, A_num_block, 32]
+  int   *indices,   // [batch_size, num_block]
+  float *sparse_C,  // [batch_size, num_block, 32, 32]
+  long batch_size,
+  long A_num_block,
+  long B_num_block,
+  long num_block
+) {
+
+  long batch_idx = blockIdx.y;
+  long block_idx = blockIdx.x * blockDim.y + threadIdx.y;
+
+  long thread_idx = threadIdx.x;
+
+  long batch_idx__block_idx = batch_idx * num_block + block_idx;
+
+  long AB_block_idx = indices[batch_idx__block_idx];
+  float *dense_A_pt = &dense_A[(batch_idx * A_num_block + AB_block_idx / B_num_block) * 32];
+  float *sparse_C_pt = &sparse_C[(batch_idx * num_block + block_idx) * 1024];
+
+  float value = dense_A_pt[thread_idx];
+
+  #pragma unroll
+  for (int i = 0; i < 32; i++) {
+    sparse_C_pt[i * 32 + thread_idx] = value;
+  }
+
+}
diff --git a/.venv/lib/python3.11/site-packages/transformers/kernels/mra/cuda_kernel.h b/.venv/lib/python3.11/site-packages/transformers/kernels/mra/cuda_kernel.h
new file mode 100644
index 0000000000000000000000000000000000000000..a95b46f7d159b11851143710034cf80c20aa6bf8
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/transformers/kernels/mra/cuda_kernel.h
@@ -0,0 +1,59 @@
+
+#define WARP_SIZE 32
+#define FULL_MASK 0xffffffff
+#define OPTIMAL_THREADS 256
+
+__global__ void index_max_cuda_kernel(
+  float *index_vals,       // [batch_size, 32, num_block]
+  int   *indices,        // [batch_size, num_block]
+  float *max_vals,        // [batch_size, A_num_block * 32]
+  float *max_vals_scatter,   // [batch_size, 32, num_block]
+  long batch_size,
+  long A_num_block,
+  long B_num_block,
+  long num_block
+);
+
+__global__ void mm_to_sparse_cuda_kernel(
+  float *dense_A,   // [batch_size, A_num_block, dim, 32]
+  float *dense_B,   // [batch_size, B_num_block, dim, 32]
+  int   *indices,   // [batch_size, num_block]
+  float *sparse_C,  // [batch_size, num_block, 32, 32]
+  long batch_size,
+  long A_num_block,
+  long B_num_block,
+  long dim,
+  long num_block
+);
+
+__global__ void sparse_dense_mm_cuda_kernel(
+  float *sparse_A,  // [batch_size, num_block, 32, 32]
+  int   *indices,   // [batch_size, num_block]
+  float *dense_B,   // [batch_size, B_num_block, dim, 32]
+  float *dense_C,   // [batch_size, A_num_block, dim, 32]
+  long batch_size,
+  long A_num_block,
+  long B_num_block,
+  long dim,
+  long num_block
+);
+
+__global__ void reduce_sum_cuda_kernel(
+  float *sparse_A,  // [batch_size, num_block, 32, 32]
+  int   *indices,   // [batch_size, num_block]
+  float *dense_C,   // [batch_size, A_num_block, 32]
+  long batch_size,
+  long A_num_block,
+  long B_num_block,
+  long num_block
+);
+
+__global__ void scatter_cuda_kernel(
+  float *dense_A,   // [batch_size, A_num_block, 32]
+  int   *indices,   // [batch_size, num_block]
+  float *sparse_C,  // [batch_size, num_block, 32, 32]
+  long batch_size,
+  long A_num_block,
+  long B_num_block,
+  long num_block
+);
diff --git a/.venv/lib/python3.11/site-packages/transformers/kernels/mra/cuda_launch.cu b/.venv/lib/python3.11/site-packages/transformers/kernels/mra/cuda_launch.cu
new file mode 100644
index 0000000000000000000000000000000000000000..ba2a0cacfe614e75e06d2dde80dc77a6e8a4ec1a
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/transformers/kernels/mra/cuda_launch.cu
@@ -0,0 +1,154 @@
+#include <torch/extension.h>
+#include <ATen/ATen.h>
+#include "cuda_launch.h"
+#include "cuda_kernel.h"
+#include <vector>
+
+//////////////////////////////////////////////////////////////////////////////////////////////////
+//////////////////////////////////////////////////////////////////////////////////////////////////
+
+std::vector<at::Tensor> index_max_kernel(
+  at::Tensor index_vals,  // [batch_size, 32, num_block]
+  at::Tensor indices,     // [batch_size, num_block],
+  int A_num_block,
+  int B_num_block
+) {
+  int batch_size = indices.size(0);
+  int num_block = indices.size(1);
+
+  at::Tensor max_vals = at::zeros({batch_size, A_num_block * 32}, index_vals.options());
+  at::Tensor max_vals_scatter = at::zeros({batch_size, 32, num_block}, index_vals.options());
+
+  dim3 threads(256);
+  dim3 blocks(batch_size);
+  int shared_mem = A_num_block * 32 * sizeof(float);
+
+  index_max_cuda_kernel<<<blocks, threads, shared_mem>>>(
+    index_vals.data_ptr<float>(),
+    indices.data_ptr<int>(),
+    max_vals.data_ptr<float>(),
+    max_vals_scatter.data_ptr<float>(),
+    batch_size,
+    A_num_block,
+    B_num_block,
+    num_block
+  );
+
+  return {max_vals, max_vals_scatter};
+}
+
+at::Tensor mm_to_sparse_kernel(
+  at::Tensor dense_A,  // [batch_size, A_num_block, dim, 32]
+  at::Tensor dense_B,  // [batch_size, B_num_block, dim, 32]
+  at::Tensor indices   // [batch_size, num_block]
+) {
+  int batch_size = dense_A.size(0);
+  int A_num_block = dense_A.size(1);
+  int B_num_block = dense_B.size(1);
+  int dim = dense_A.size(2);
+  int num_block = indices.size(1);
+
+  at::Tensor sparse_C = at::zeros({batch_size, num_block, 32, 32}, dense_A.options());
+
+  dim3 threads(64, 4);
+  dim3 blocks(num_block / 4, batch_size);
+
+  mm_to_sparse_cuda_kernel<<<blocks, threads>>>(
+    dense_A.data_ptr<float>(),
+    dense_B.data_ptr<float>(),
+    indices.data_ptr<int>(),
+    sparse_C.data_ptr<float>(),
+    batch_size,
+    A_num_block,
+    B_num_block,
+    dim,
+    num_block
+  );
+
+  return sparse_C;
+}
+
+at::Tensor sparse_dense_mm_kernel(
+  at::Tensor sparse_A,  // [batch_size, num_block, 32, 32]
+  at::Tensor indices,   // [batch_size, num_block]
+  at::Tensor dense_B,   // [batch_size, B_num_block, dim, 32]
+  int A_num_block
+) {
+  int batch_size = sparse_A.size(0);
+  int num_block = sparse_A.size(1);
+  int B_num_block = dense_B.size(1);
+  int dim = dense_B.size(2);
+
+  at::Tensor dense_C = at::zeros({batch_size, A_num_block, dim, 32}, dense_B.options());
+
+  dim3 threads(128, 2);
+  dim3 blocks(num_block / 2, batch_size);
+
+  sparse_dense_mm_cuda_kernel<<<blocks, threads>>>(
+    sparse_A.data_ptr<float>(),
+    indices.data_ptr<int>(),
+    dense_B.data_ptr<float>(),
+    dense_C.data_ptr<float>(),
+    batch_size,
+    A_num_block,
+    B_num_block,
+    dim,
+    num_block
+  );
+
+  return dense_C;
+}
+
+at::Tensor reduce_sum_kernel(
+  at::Tensor sparse_A,  // [batch_size, num_block, 32, 32]
+  at::Tensor indices,   // [batch_size, num_block]
+  int A_num_block,
+  int B_num_block
+) {
+  int batch_size = sparse_A.size(0);
+  int num_block = sparse_A.size(1);
+
+  at::Tensor dense_C = at::zeros({batch_size, A_num_block, 32}, sparse_A.options());
+
+  dim3 threads(32, 4);
+  dim3 blocks(num_block / 4, batch_size);
+
+  reduce_sum_cuda_kernel<<<blocks, threads>>>(
+    sparse_A.data_ptr<float>(),
+    indices.data_ptr<int>(),
+    dense_C.data_ptr<float>(),
+    batch_size,
+    A_num_block,
+    B_num_block,
+    num_block
+  );
+
+  return dense_C;
+}
+
+at::Tensor scatter_kernel(
+  at::Tensor dense_A,   // [batch_size, A_num_block, 32]
+  at::Tensor indices,   // [batch_size, num_block]
+  int B_num_block
+) {
+  int batch_size = dense_A.size(0);
+  int A_num_block = dense_A.size(1);
+  int num_block = indices.size(1);
+
+  at::Tensor sparse_C = at::zeros({batch_size, num_block, 32, 32}, dense_A.options());
+
+  dim3 threads(32, 4);
+  dim3 blocks(num_block / 4, batch_size);
+
+  scatter_cuda_kernel<<<blocks, threads>>>(
+    dense_A.data_ptr<float>(),
+    indices.data_ptr<int>(),
+    sparse_C.data_ptr<float>(),
+    batch_size,
+    A_num_block,
+    B_num_block,
+    num_block
+  );
+
+  return sparse_C;
+}
diff --git a/.venv/lib/python3.11/site-packages/transformers/kernels/mra/cuda_launch.h b/.venv/lib/python3.11/site-packages/transformers/kernels/mra/cuda_launch.h
new file mode 100644
index 0000000000000000000000000000000000000000..0200140ee337b8c5d9583767bbad1e842e9d4677
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/transformers/kernels/mra/cuda_launch.h
@@ -0,0 +1,39 @@
+#include <torch/extension.h>
+#include <ATen/ATen.h>
+#include <vector>
+
+#define min(a, b) ((a)<(b)?(a):(b))
+#define max(a, b) ((a)>(b)?(a):(b))
+
+std::vector<at::Tensor> index_max_kernel(
+  at::Tensor index_vals,
+  at::Tensor indices,
+  int A_num_block,
+  int B_num_block
+);
+
+at::Tensor mm_to_sparse_kernel(
+  at::Tensor dense_A,
+  at::Tensor dense_B,
+  at::Tensor indices
+);
+
+at::Tensor sparse_dense_mm_kernel(
+  at::Tensor sparse_A,
+  at::Tensor indices,
+  at::Tensor dense_B,
+  int A_num_block
+);
+
+at::Tensor reduce_sum_kernel(
+  at::Tensor sparse_A,
+  at::Tensor indices,
+  int A_num_block,
+  int B_num_block
+);
+
+at::Tensor scatter_kernel(
+  at::Tensor dense_A,
+  at::Tensor indices,
+  int B_num_block
+);
diff --git a/.venv/lib/python3.11/site-packages/transformers/kernels/mra/torch_extension.cpp b/.venv/lib/python3.11/site-packages/transformers/kernels/mra/torch_extension.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..60c9262b779270a6e95ae54f53a67daa6d740a9e
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/transformers/kernels/mra/torch_extension.cpp
@@ -0,0 +1,78 @@
+#include <torch/extension.h>
+#include <ATen/ATen.h>
+#include "cuda_launch.h"
+#include <vector>
+
+std::vector<at::Tensor> index_max(
+  at::Tensor index_vals,
+  at::Tensor indices,
+  int A_num_block,
+  int B_num_block
+) {
+  return index_max_kernel(
+    index_vals,
+    indices,
+    A_num_block,
+    B_num_block
+  );
+}
+
+at::Tensor mm_to_sparse(
+  at::Tensor dense_A,
+  at::Tensor dense_B,
+  at::Tensor indices
+) {
+  return mm_to_sparse_kernel(
+    dense_A,
+    dense_B,
+    indices
+  );
+}
+
+at::Tensor sparse_dense_mm(
+  at::Tensor sparse_A,
+  at::Tensor indices,
+  at::Tensor dense_B,
+  int A_num_block
+) {
+  return sparse_dense_mm_kernel(
+    sparse_A,
+    indices,
+    dense_B,
+    A_num_block
+  );
+}
+
+at::Tensor reduce_sum(
+  at::Tensor sparse_A,
+  at::Tensor indices,
+  int A_num_block,
+  int B_num_block
+) {
+  return reduce_sum_kernel(
+    sparse_A,
+    indices,
+    A_num_block,
+    B_num_block
+  );
+}
+
+at::Tensor scatter(
+  at::Tensor dense_A,
+  at::Tensor indices,
+  int B_num_block
+) {
+  return scatter_kernel(
+    dense_A,
+    indices,
+    B_num_block
+  );
+}
+
+PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
+  m.def("index_max", &index_max, "index_max (CUDA)");
+  m.def("mm_to_sparse", &mm_to_sparse, "mm_to_sparse (CUDA)");
+  m.def("sparse_dense_mm", &sparse_dense_mm, "sparse_dense_mm (CUDA)");
+  m.def("reduce_sum", &reduce_sum, "reduce_sum (CUDA)");
+  m.def("scatter", &scatter, "scatter (CUDA)");
+}
diff --git a/.venv/lib/python3.11/site-packages/transformers/kernels/yoso/common_cuda.h b/.venv/lib/python3.11/site-packages/transformers/kernels/yoso/common_cuda.h
new file mode 100644
index 0000000000000000000000000000000000000000..97030870649a2fdac58cb26cf966e8f5c8cc7909
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/transformers/kernels/yoso/common_cuda.h
@@ -0,0 +1,9 @@
+
+#define MAX_THREADS_PER_BLOCK 1024
+#define OPTIMAL_THREADS_PER_BLOCK 256
+#define WARP_SIZE 32
+#define MAX_NUM_BLOCK_X 2147483647
+#define MAX_NUM_BLOCK_Y 65535
+#define MAX_NUM_BLOCK_Z 65535
+#define MAX_SHARED_MEM_PER_BLOCK 48000
+#define FULL_MASK 0xffffffff
diff --git a/.venv/lib/python3.11/site-packages/transformers/kernels/yoso/common_cuda_device.h b/.venv/lib/python3.11/site-packages/transformers/kernels/yoso/common_cuda_device.h
new file mode 100644
index 0000000000000000000000000000000000000000..6674f93afdc25ab35c5d83881d00028bcf2989fc
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/transformers/kernels/yoso/common_cuda_device.h
@@ -0,0 +1,79 @@
+
+#include "common.h"
+
+template<typename T>
+__device__ int set_insert(T *set, int set_size, T value) {
+  int slot = value % set_size;
+  int start_slot = slot;
+  while (true) {
+    T prev = atomicCAS(&set[slot], EMPTY_VALUE, value);
+    if (prev == EMPTY_VALUE || prev == value) {
+      return slot;
+    }
+    slot = (slot + 1) % set_size;
+    if (slot == start_slot) {
+      return -1;
+    }
+  }
+  return -1;
+}
+
+template<typename T>
+__device__ int set_lookup(T *set, int set_size, T value) {
+  int slot = value % set_size;
+  int start_slot = slot;
+  while (true) {
+    if (set[slot] == value) {
+      return slot;
+    }
+    slot = (slot + 1) % set_size;
+    if (slot == start_slot) {
+      return -1;
+    }
+  }
+  return -1;
+}
+
+template<typename T>
+__device__ void init_buffer(T init_value, T *buffer, int buffer_size, int num_threads, int thread_id) {
+  __syncthreads();
+  for (int i = 0; i < buffer_size; i = i + num_threads) {
+    int offset_idx = i + thread_id;
+    if (offset_idx < buffer_size) {
+      buffer[offset_idx] = init_value;
+    }
+  }
+  __syncthreads();
+}
+
+template<typename T>
+__device__ void copy_data(T *src_pt, T *dist_pt, int data_length, int num_threads, int thread_id) {
+  __syncthreads();
+  for (int i = 0; i < data_length; i = i + num_threads) {
+    int offset_idx = i + thread_id;
+    if (offset_idx < data_length) {
+      dist_pt[offset_idx] = src_pt[offset_idx];
+    }
+  }
+  __syncthreads();
+}
+
+template<typename T>
+__device__ void init_buffer_nonblocking(T init_value, T *buffer, int buffer_size, int num_threads, int thread_id) {
+  for (int i = 0; i < buffer_size; i = i + num_threads) {
+    int offset_idx = i + thread_id;
+    if (offset_idx < buffer_size) {
+      buffer[offset_idx] = init_value;
+    }
+  }
+}
+
+template<typename T>
+__device__ void copy_data_nonblocking(T *src_pt, T *dist_pt, int data_length, int num_threads, int thread_id) {
+  for (int i = 0; i < data_length; i = i + num_threads) {
+    int offset_idx = i + thread_id;
+    if (offset_idx < data_length) {
+      dist_pt[offset_idx] = src_pt[offset_idx];
+    }
+  }
+}
diff --git a/.venv/lib/python3.11/site-packages/transformers/loss/__init__.py b/.venv/lib/python3.11/site-packages/transformers/loss/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..196860c9f1c60559826cf6eca845e79834b9c8fd
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/transformers/loss/__init__.py
@@ -0,0 +1,13 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
diff --git a/.venv/lib/python3.11/site-packages/transformers/loss/__pycache__/__init__.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/loss/__pycache__/__init__.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..1a9c93012da99b9fe5f6946ec9c5a9234d2146df
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/loss/__pycache__/__init__.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/transformers/loss/__pycache__/loss_deformable_detr.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/loss/__pycache__/loss_deformable_detr.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c147e52a6b931efcbdb9399f951c547f6f8e01b4
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/loss/__pycache__/loss_deformable_detr.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/transformers/loss/__pycache__/loss_for_object_detection.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/loss/__pycache__/loss_for_object_detection.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..0f11a2ac138b8ff5fbb3ba31cf896e276150a7fe
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/loss/__pycache__/loss_for_object_detection.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/transformers/loss/__pycache__/loss_rt_detr.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/loss/__pycache__/loss_rt_detr.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..bb01d59569dcb4ac4858944aa901d4e259b4dadc
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/loss/__pycache__/loss_rt_detr.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/transformers/loss/__pycache__/loss_utils.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/loss/__pycache__/loss_utils.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..653cb48b30ebb7f0d13c673f78ec53d20ad06879
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/loss/__pycache__/loss_utils.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/transformers/loss/loss_deformable_detr.py b/.venv/lib/python3.11/site-packages/transformers/loss/loss_deformable_detr.py
new file mode 100644
index 0000000000000000000000000000000000000000..62080bcb3fd94fc0fc9051640744a81e03657d4e
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/transformers/loss/loss_deformable_detr.py
@@ -0,0 +1,178 @@
+import torch
+import torch.nn as nn
+
+from ..image_transforms import center_to_corners_format
+from ..utils import is_scipy_available
+from .loss_for_object_detection import (
+    HungarianMatcher,
+    ImageLoss,
+    _set_aux_loss,
+    generalized_box_iou,
+    sigmoid_focal_loss,
+)
+
+
+if is_scipy_available():
+    from scipy.optimize import linear_sum_assignment
+
+
+class DeformableDetrHungarianMatcher(HungarianMatcher):
+    @torch.no_grad()
+    def forward(self, outputs, targets):
+        """
+        Differences:
+        - out_prob = outputs["logits"].flatten(0, 1).sigmoid() instead of softmax
+        - class_cost uses alpha and gamma
+        """
+        batch_size, num_queries = outputs["logits"].shape[:2]
+
+        # We flatten to compute the cost matrices in a batch
+        out_prob = outputs["logits"].flatten(0, 1).sigmoid()  # [batch_size * num_queries, num_classes]
+        out_bbox = outputs["pred_boxes"].flatten(0, 1)  # [batch_size * num_queries, 4]
+
+        # Also concat the target labels and boxes
+        target_ids = torch.cat([v["class_labels"] for v in targets])
+        target_bbox = torch.cat([v["boxes"] for v in targets])
+
+        # Compute the classification cost.
+        alpha = 0.25
+        gamma = 2.0
+        neg_cost_class = (1 - alpha) * (out_prob**gamma) * (-(1 - out_prob + 1e-8).log())
+        pos_cost_class = alpha * ((1 - out_prob) ** gamma) * (-(out_prob + 1e-8).log())
+        class_cost = pos_cost_class[:, target_ids] - neg_cost_class[:, target_ids]
+
+        # Compute the L1 cost between boxes
+        bbox_cost = torch.cdist(out_bbox, target_bbox, p=1)
+
+        # Compute the giou cost between boxes
+        giou_cost = -generalized_box_iou(center_to_corners_format(out_bbox), center_to_corners_format(target_bbox))
+
+        # Final cost matrix
+        cost_matrix = self.bbox_cost * bbox_cost + self.class_cost * class_cost + self.giou_cost * giou_cost
+        cost_matrix = cost_matrix.view(batch_size, num_queries, -1).cpu()
+
+        sizes = [len(v["boxes"]) for v in targets]
+        indices = [linear_sum_assignment(c[i]) for i, c in enumerate(cost_matrix.split(sizes, -1))]
+        return [(torch.as_tensor(i, dtype=torch.int64), torch.as_tensor(j, dtype=torch.int64)) for i, j in indices]
+
+
+class DeformableDetrImageLoss(ImageLoss):
+    def __init__(self, matcher, num_classes, focal_alpha, losses):
+        nn.Module.__init__(self)
+        self.matcher = matcher
+        self.num_classes = num_classes
+        self.focal_alpha = focal_alpha
+        self.losses = losses
+
+    # removed logging parameter, which was part of the original implementation
+    def loss_labels(self, outputs, targets, indices, num_boxes):
+        """
+        Classification loss (Binary focal loss) targets dicts must contain the key "class_labels" containing a tensor
+        of dim [nb_target_boxes]
+        """
+        if "logits" not in outputs:
+            raise KeyError("No logits were found in the outputs")
+        source_logits = outputs["logits"]
+
+        idx = self._get_source_permutation_idx(indices)
+        target_classes_o = torch.cat([t["class_labels"][J] for t, (_, J) in zip(targets, indices)])
+        target_classes = torch.full(
+            source_logits.shape[:2], self.num_classes, dtype=torch.int64, device=source_logits.device
+        )
+        target_classes[idx] = target_classes_o
+
+        target_classes_onehot = torch.zeros(
+            [source_logits.shape[0], source_logits.shape[1], source_logits.shape[2] + 1],
+            dtype=source_logits.dtype,
+            layout=source_logits.layout,
+            device=source_logits.device,
+        )
+        target_classes_onehot.scatter_(2, target_classes.unsqueeze(-1), 1)
+
+        target_classes_onehot = target_classes_onehot[:, :, :-1]
+        loss_ce = (
+            sigmoid_focal_loss(source_logits, target_classes_onehot, num_boxes, alpha=self.focal_alpha, gamma=2)
+            * source_logits.shape[1]
+        )
+        losses = {"loss_ce": loss_ce}
+
+        return losses
+
+
+def DeformableDetrForSegmentationLoss(
+    logits, labels, device, pred_boxes, pred_masks, config, outputs_class=None, outputs_coord=None, **kwargs
+):
+    # First: create the matcher
+    matcher = HungarianMatcher(class_cost=config.class_cost, bbox_cost=config.bbox_cost, giou_cost=config.giou_cost)
+    # Second: create the criterion
+    losses = ["labels", "boxes", "cardinality", "masks"]
+    criterion = DeformableDetrImageLoss(
+        matcher=matcher,
+        num_classes=config.num_labels,
+        focal_alpha=config.focal_alpha,
+        losses=losses,
+    )
+    criterion.to(device)
+    # Third: compute the losses, based on outputs and labels
+    outputs_loss = {}
+    outputs_loss["logits"] = logits
+    outputs_loss["pred_boxes"] = pred_boxes
+    outputs_loss["pred_masks"] = pred_masks
+
+    auxiliary_outputs = None
+    if config.auxiliary_loss:
+        auxiliary_outputs = _set_aux_loss(outputs_class, outputs_coord)
+        outputs_loss["auxiliary_outputs"] = auxiliary_outputs
+
+    loss_dict = criterion(outputs_loss, labels)
+    # Fourth: compute total loss, as a weighted sum of the various losses
+    weight_dict = {"loss_ce": 1, "loss_bbox": config.bbox_loss_coefficient}
+    weight_dict["loss_giou"] = config.giou_loss_coefficient
+    weight_dict["loss_mask"] = config.mask_loss_coefficient
+    weight_dict["loss_dice"] = config.dice_loss_coefficient
+    if config.auxiliary_loss:
+        aux_weight_dict = {}
+        for i in range(config.decoder_layers - 1):
+            aux_weight_dict.update({k + f"_{i}": v for k, v in weight_dict.items()})
+        weight_dict.update(aux_weight_dict)
+
+    loss = sum(loss_dict[k] * weight_dict[k] for k in loss_dict.keys() if k in weight_dict)
+    return loss, loss_dict, auxiliary_outputs
+
+
+def DeformableDetrForObjectDetectionLoss(
+    logits, labels, device, pred_boxes, config, outputs_class=None, outputs_coord=None, **kwargs
+):
+    # First: create the matcher
+    matcher = DeformableDetrHungarianMatcher(
+        class_cost=config.class_cost, bbox_cost=config.bbox_cost, giou_cost=config.giou_cost
+    )
+    # Second: create the criterion
+    losses = ["labels", "boxes", "cardinality"]
+    criterion = DeformableDetrImageLoss(
+        matcher=matcher,
+        num_classes=config.num_labels,
+        focal_alpha=config.focal_alpha,
+        losses=losses,
+    )
+    criterion.to(device)
+    # Third: compute the losses, based on outputs and labels
+    outputs_loss = {}
+    auxiliary_outputs = None
+    outputs_loss["logits"] = logits
+    outputs_loss["pred_boxes"] = pred_boxes
+    if config.auxiliary_loss:
+        auxiliary_outputs = _set_aux_loss(outputs_class, outputs_coord)
+        outputs_loss["auxiliary_outputs"] = auxiliary_outputs
+
+    loss_dict = criterion(outputs_loss, labels)
+    # Fourth: compute total loss, as a weighted sum of the various losses
+    weight_dict = {"loss_ce": 1, "loss_bbox": config.bbox_loss_coefficient}
+    weight_dict["loss_giou"] = config.giou_loss_coefficient
+    if config.auxiliary_loss:
+        aux_weight_dict = {}
+        for i in range(config.decoder_layers - 1):
+            aux_weight_dict.update({k + f"_{i}": v for k, v in weight_dict.items()})
+        weight_dict.update(aux_weight_dict)
+    loss = sum(loss_dict[k] * weight_dict[k] for k in loss_dict.keys() if k in weight_dict)
+    return loss, loss_dict, auxiliary_outputs
diff --git a/.venv/lib/python3.11/site-packages/transformers/loss/loss_for_object_detection.py b/.venv/lib/python3.11/site-packages/transformers/loss/loss_for_object_detection.py
new file mode 100644
index 0000000000000000000000000000000000000000..b820f6daed1224dace31fa30be24ac26094effd2
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/transformers/loss/loss_for_object_detection.py
@@ -0,0 +1,562 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import List, Optional
+
+import torch
+import torch.nn as nn
+from torch import Tensor
+
+from ..utils import is_accelerate_available, is_scipy_available, is_vision_available, requires_backends
+
+
+if is_accelerate_available():
+    from accelerate import PartialState
+    from accelerate.utils import reduce
+
+if is_scipy_available():
+    from scipy.optimize import linear_sum_assignment
+
+
+if is_vision_available():
+    from transformers.image_transforms import center_to_corners_format
+
+
+def dice_loss(inputs, targets, num_boxes):
+    """
+    Compute the DICE loss, similar to generalized IOU for masks
+
+    Args:
+        inputs: A float tensor of arbitrary shape.
+                The predictions for each example.
+        targets: A float tensor with the same shape as inputs. Stores the binary
+                 classification label for each element in inputs (0 for the negative class and 1 for the positive
+                 class).
+    """
+    inputs = inputs.sigmoid()
+    inputs = inputs.flatten(1)
+    numerator = 2 * (inputs * targets).sum(1)
+    denominator = inputs.sum(-1) + targets.sum(-1)
+    loss = 1 - (numerator + 1) / (denominator + 1)
+    return loss.sum() / num_boxes
+
+
+def sigmoid_focal_loss(inputs, targets, num_boxes, alpha: float = 0.25, gamma: float = 2):
+    """
+    Loss used in RetinaNet for dense detection: https://arxiv.org/abs/1708.02002.
+
+    Args:
+        inputs (`torch.FloatTensor` of arbitrary shape):
+            The predictions for each example.
+        targets (`torch.FloatTensor` with the same shape as `inputs`)
+            A tensor storing the binary classification label for each element in the `inputs` (0 for the negative class
+            and 1 for the positive class).
+        alpha (`float`, *optional*, defaults to `0.25`):
+            Optional weighting factor in the range (0,1) to balance positive vs. negative examples.
+        gamma (`int`, *optional*, defaults to `2`):
+            Exponent of the modulating factor (1 - p_t) to balance easy vs hard examples.
+
+    Returns:
+        Loss tensor
+    """
+    prob = inputs.sigmoid()
+    ce_loss = nn.functional.binary_cross_entropy_with_logits(inputs, targets, reduction="none")
+    # add modulating factor
+    p_t = prob * targets + (1 - prob) * (1 - targets)
+    loss = ce_loss * ((1 - p_t) ** gamma)
+
+    if alpha >= 0:
+        alpha_t = alpha * targets + (1 - alpha) * (1 - targets)
+        loss = alpha_t * loss
+
+    return loss.mean(1).sum() / num_boxes
+
+
+# taken from https://github.com/facebookresearch/detr/blob/master/models/detr.py
+class ImageLoss(nn.Module):
+    """
+    This class computes the losses for DetrForObjectDetection/DetrForSegmentation. The process happens in two steps: 1)
+    we compute hungarian assignment between ground truth boxes and the outputs of the model 2) we supervise each pair
+    of matched ground-truth / prediction (supervise class and box).
+
+    A note on the `num_classes` argument (copied from original repo in detr.py): "the naming of the `num_classes`
+    parameter of the criterion is somewhat misleading. It indeed corresponds to `max_obj_id` + 1, where `max_obj_id` is
+    the maximum id for a class in your dataset. For example, COCO has a `max_obj_id` of 90, so we pass `num_classes` to
+    be 91. As another example, for a dataset that has a single class with `id` 1, you should pass `num_classes` to be 2
+    (`max_obj_id` + 1). For more details on this, check the following discussion
+    https://github.com/facebookresearch/detr/issues/108#issuecomment-650269223"
+
+
+    Args:
+        matcher (`DetrHungarianMatcher`):
+            Module able to compute a matching between targets and proposals.
+        num_classes (`int`):
+            Number of object categories, omitting the special no-object category.
+        eos_coef (`float`):
+            Relative classification weight applied to the no-object category.
+        losses (`List[str]`):
+            List of all the losses to be applied. See `get_loss` for a list of all available losses.
+    """
+
+    def __init__(self, matcher, num_classes, eos_coef, losses):
+        super().__init__()
+        self.matcher = matcher
+        self.num_classes = num_classes
+        self.eos_coef = eos_coef
+        self.losses = losses
+        empty_weight = torch.ones(self.num_classes + 1)
+        empty_weight[-1] = self.eos_coef
+        self.register_buffer("empty_weight", empty_weight)
+
+    # removed logging parameter, which was part of the original implementation
+    def loss_labels(self, outputs, targets, indices, num_boxes):
+        """
+        Classification loss (NLL) targets dicts must contain the key "class_labels" containing a tensor of dim
+        [nb_target_boxes]
+        """
+        if "logits" not in outputs:
+            raise KeyError("No logits were found in the outputs")
+        source_logits = outputs["logits"]
+
+        idx = self._get_source_permutation_idx(indices)
+        target_classes_o = torch.cat([t["class_labels"][J] for t, (_, J) in zip(targets, indices)])
+        target_classes = torch.full(
+            source_logits.shape[:2], self.num_classes, dtype=torch.int64, device=source_logits.device
+        )
+        target_classes[idx] = target_classes_o
+
+        loss_ce = nn.functional.cross_entropy(source_logits.transpose(1, 2), target_classes, self.empty_weight)
+        losses = {"loss_ce": loss_ce}
+
+        return losses
+
+    @torch.no_grad()
+    def loss_cardinality(self, outputs, targets, indices, num_boxes):
+        """
+        Compute the cardinality error, i.e. the absolute error in the number of predicted non-empty boxes.
+
+        This is not really a loss, it is intended for logging purposes only. It doesn't propagate gradients.
+        """
+        logits = outputs["logits"]
+        device = logits.device
+        target_lengths = torch.as_tensor([len(v["class_labels"]) for v in targets], device=device)
+        # Count the number of predictions that are NOT "no-object" (which is the last class)
+        card_pred = (logits.argmax(-1) != logits.shape[-1] - 1).sum(1)
+        card_err = nn.functional.l1_loss(card_pred.float(), target_lengths.float())
+        losses = {"cardinality_error": card_err}
+        return losses
+
+    def loss_boxes(self, outputs, targets, indices, num_boxes):
+        """
+        Compute the losses related to the bounding boxes, the L1 regression loss and the GIoU loss.
+
+        Targets dicts must contain the key "boxes" containing a tensor of dim [nb_target_boxes, 4]. The target boxes
+        are expected in format (center_x, center_y, w, h), normalized by the image size.
+        """
+        if "pred_boxes" not in outputs:
+            raise KeyError("No predicted boxes found in outputs")
+        idx = self._get_source_permutation_idx(indices)
+        source_boxes = outputs["pred_boxes"][idx]
+        target_boxes = torch.cat([t["boxes"][i] for t, (_, i) in zip(targets, indices)], dim=0)
+
+        loss_bbox = nn.functional.l1_loss(source_boxes, target_boxes, reduction="none")
+
+        losses = {}
+        losses["loss_bbox"] = loss_bbox.sum() / num_boxes
+
+        loss_giou = 1 - torch.diag(
+            generalized_box_iou(center_to_corners_format(source_boxes), center_to_corners_format(target_boxes))
+        )
+        losses["loss_giou"] = loss_giou.sum() / num_boxes
+        return losses
+
+    def loss_masks(self, outputs, targets, indices, num_boxes):
+        """
+        Compute the losses related to the masks: the focal loss and the dice loss.
+
+        Targets dicts must contain the key "masks" containing a tensor of dim [nb_target_boxes, h, w].
+        """
+        if "pred_masks" not in outputs:
+            raise KeyError("No predicted masks found in outputs")
+
+        source_idx = self._get_source_permutation_idx(indices)
+        target_idx = self._get_target_permutation_idx(indices)
+        source_masks = outputs["pred_masks"]
+        source_masks = source_masks[source_idx]
+        masks = [t["masks"] for t in targets]
+        # TODO use valid to mask invalid areas due to padding in loss
+        target_masks, valid = nested_tensor_from_tensor_list(masks).decompose()
+        target_masks = target_masks.to(source_masks)
+        target_masks = target_masks[target_idx]
+
+        # upsample predictions to the target size
+        source_masks = nn.functional.interpolate(
+            source_masks[:, None], size=target_masks.shape[-2:], mode="bilinear", align_corners=False
+        )
+        source_masks = source_masks[:, 0].flatten(1)
+
+        target_masks = target_masks.flatten(1)
+        target_masks = target_masks.view(source_masks.shape)
+        losses = {
+            "loss_mask": sigmoid_focal_loss(source_masks, target_masks, num_boxes),
+            "loss_dice": dice_loss(source_masks, target_masks, num_boxes),
+        }
+        return losses
+
+    def _get_source_permutation_idx(self, indices):
+        # permute predictions following indices
+        batch_idx = torch.cat([torch.full_like(source, i) for i, (source, _) in enumerate(indices)])
+        source_idx = torch.cat([source for (source, _) in indices])
+        return batch_idx, source_idx
+
+    def _get_target_permutation_idx(self, indices):
+        # permute targets following indices
+        batch_idx = torch.cat([torch.full_like(target, i) for i, (_, target) in enumerate(indices)])
+        target_idx = torch.cat([target for (_, target) in indices])
+        return batch_idx, target_idx
+
+    def get_loss(self, loss, outputs, targets, indices, num_boxes):
+        loss_map = {
+            "labels": self.loss_labels,
+            "cardinality": self.loss_cardinality,
+            "boxes": self.loss_boxes,
+            "masks": self.loss_masks,
+        }
+        if loss not in loss_map:
+            raise ValueError(f"Loss {loss} not supported")
+        return loss_map[loss](outputs, targets, indices, num_boxes)
+
+    def forward(self, outputs, targets):
+        """
+        This performs the loss computation.
+
+        Args:
+             outputs (`dict`, *optional*):
+                Dictionary of tensors, see the output specification of the model for the format.
+             targets (`List[dict]`, *optional*):
+                List of dicts, such that `len(targets) == batch_size`. The expected keys in each dict depends on the
+                losses applied, see each loss' doc.
+        """
+        outputs_without_aux = {k: v for k, v in outputs.items() if k != "auxiliary_outputs"}
+
+        # Retrieve the matching between the outputs of the last layer and the targets
+        indices = self.matcher(outputs_without_aux, targets)
+
+        # Compute the average number of target boxes across all nodes, for normalization purposes
+        num_boxes = sum(len(t["class_labels"]) for t in targets)
+        num_boxes = torch.as_tensor([num_boxes], dtype=torch.float, device=next(iter(outputs.values())).device)
+        world_size = 1
+        if is_accelerate_available():
+            if PartialState._shared_state != {}:
+                num_boxes = reduce(num_boxes)
+                world_size = PartialState().num_processes
+        num_boxes = torch.clamp(num_boxes / world_size, min=1).item()
+
+        # Compute all the requested losses
+        losses = {}
+        for loss in self.losses:
+            losses.update(self.get_loss(loss, outputs, targets, indices, num_boxes))
+
+        # In case of auxiliary losses, we repeat this process with the output of each intermediate layer.
+        if "auxiliary_outputs" in outputs:
+            for i, auxiliary_outputs in enumerate(outputs["auxiliary_outputs"]):
+                indices = self.matcher(auxiliary_outputs, targets)
+                for loss in self.losses:
+                    if loss == "masks":
+                        # Intermediate masks losses are too costly to compute, we ignore them.
+                        continue
+                    l_dict = self.get_loss(loss, auxiliary_outputs, targets, indices, num_boxes)
+                    l_dict = {k + f"_{i}": v for k, v in l_dict.items()}
+                    losses.update(l_dict)
+
+        return losses
+
+
+# taken from https://github.com/facebookresearch/detr/blob/master/models/matcher.py
+class HungarianMatcher(nn.Module):
+    """
+    This class computes an assignment between the targets and the predictions of the network.
+
+    For efficiency reasons, the targets don't include the no_object. Because of this, in general, there are more
+    predictions than targets. In this case, we do a 1-to-1 matching of the best predictions, while the others are
+    un-matched (and thus treated as non-objects).
+
+    Args:
+        class_cost:
+            The relative weight of the classification error in the matching cost.
+        bbox_cost:
+            The relative weight of the L1 error of the bounding box coordinates in the matching cost.
+        giou_cost:
+            The relative weight of the giou loss of the bounding box in the matching cost.
+    """
+
+    def __init__(self, class_cost: float = 1, bbox_cost: float = 1, giou_cost: float = 1):
+        super().__init__()
+        requires_backends(self, ["scipy"])
+
+        self.class_cost = class_cost
+        self.bbox_cost = bbox_cost
+        self.giou_cost = giou_cost
+        if class_cost == 0 and bbox_cost == 0 and giou_cost == 0:
+            raise ValueError("All costs of the Matcher can't be 0")
+
+    @torch.no_grad()
+    def forward(self, outputs, targets):
+        """
+        Args:
+            outputs (`dict`):
+                A dictionary that contains at least these entries:
+                * "logits": Tensor of dim [batch_size, num_queries, num_classes] with the classification logits
+                * "pred_boxes": Tensor of dim [batch_size, num_queries, 4] with the predicted box coordinates.
+            targets (`List[dict]`):
+                A list of targets (len(targets) = batch_size), where each target is a dict containing:
+                * "class_labels": Tensor of dim [num_target_boxes] (where num_target_boxes is the number of
+                  ground-truth
+                 objects in the target) containing the class labels
+                * "boxes": Tensor of dim [num_target_boxes, 4] containing the target box coordinates.
+
+        Returns:
+            `List[Tuple]`: A list of size `batch_size`, containing tuples of (index_i, index_j) where:
+            - index_i is the indices of the selected predictions (in order)
+            - index_j is the indices of the corresponding selected targets (in order)
+            For each batch element, it holds: len(index_i) = len(index_j) = min(num_queries, num_target_boxes)
+        """
+        batch_size, num_queries = outputs["logits"].shape[:2]
+
+        # We flatten to compute the cost matrices in a batch
+        out_prob = outputs["logits"].flatten(0, 1).softmax(-1)  # [batch_size * num_queries, num_classes]
+        out_bbox = outputs["pred_boxes"].flatten(0, 1)  # [batch_size * num_queries, 4]
+
+        # Also concat the target labels and boxes
+        target_ids = torch.cat([v["class_labels"] for v in targets])
+        target_bbox = torch.cat([v["boxes"] for v in targets])
+
+        # Compute the classification cost. Contrary to the loss, we don't use the NLL,
+        # but approximate it in 1 - proba[target class].
+        # The 1 is a constant that doesn't change the matching, it can be ommitted.
+        class_cost = -out_prob[:, target_ids]
+
+        # Compute the L1 cost between boxes
+        bbox_cost = torch.cdist(out_bbox, target_bbox, p=1)
+
+        # Compute the giou cost between boxes
+        giou_cost = -generalized_box_iou(center_to_corners_format(out_bbox), center_to_corners_format(target_bbox))
+
+        # Final cost matrix
+        cost_matrix = self.bbox_cost * bbox_cost + self.class_cost * class_cost + self.giou_cost * giou_cost
+        cost_matrix = cost_matrix.view(batch_size, num_queries, -1).cpu()
+
+        sizes = [len(v["boxes"]) for v in targets]
+        indices = [linear_sum_assignment(c[i]) for i, c in enumerate(cost_matrix.split(sizes, -1))]
+        return [(torch.as_tensor(i, dtype=torch.int64), torch.as_tensor(j, dtype=torch.int64)) for i, j in indices]
+
+
+# below: bounding box utilities taken from https://github.com/facebookresearch/detr/blob/master/util/box_ops.py
+
+
+def _upcast(t: Tensor) -> Tensor:
+    # Protects from numerical overflows in multiplications by upcasting to the equivalent higher type
+    if t.is_floating_point():
+        return t if t.dtype in (torch.float32, torch.float64) else t.float()
+    else:
+        return t if t.dtype in (torch.int32, torch.int64) else t.int()
+
+
+def box_area(boxes: Tensor) -> Tensor:
+    """
+    Computes the area of a set of bounding boxes, which are specified by its (x1, y1, x2, y2) coordinates.
+
+    Args:
+        boxes (`torch.FloatTensor` of shape `(number_of_boxes, 4)`):
+            Boxes for which the area will be computed. They are expected to be in (x1, y1, x2, y2) format with `0 <= x1
+            < x2` and `0 <= y1 < y2`.
+
+    Returns:
+        `torch.FloatTensor`: a tensor containing the area for each box.
+    """
+    boxes = _upcast(boxes)
+    return (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1])
+
+
+# modified from torchvision to also return the union
+def box_iou(boxes1, boxes2):
+    area1 = box_area(boxes1)
+    area2 = box_area(boxes2)
+
+    left_top = torch.max(boxes1[:, None, :2], boxes2[:, :2])  # [N,M,2]
+    right_bottom = torch.min(boxes1[:, None, 2:], boxes2[:, 2:])  # [N,M,2]
+
+    width_height = (right_bottom - left_top).clamp(min=0)  # [N,M,2]
+    inter = width_height[:, :, 0] * width_height[:, :, 1]  # [N,M]
+
+    union = area1[:, None] + area2 - inter
+
+    iou = inter / union
+    return iou, union
+
+
+def generalized_box_iou(boxes1, boxes2):
+    """
+    Generalized IoU from https://giou.stanford.edu/. The boxes should be in [x0, y0, x1, y1] (corner) format.
+
+    Returns:
+        `torch.FloatTensor`: a [N, M] pairwise matrix, where N = len(boxes1) and M = len(boxes2)
+    """
+    # degenerate boxes gives inf / nan results
+    # so do an early check
+    if not (boxes1[:, 2:] >= boxes1[:, :2]).all():
+        raise ValueError(f"boxes1 must be in [x0, y0, x1, y1] (corner) format, but got {boxes1}")
+    if not (boxes2[:, 2:] >= boxes2[:, :2]).all():
+        raise ValueError(f"boxes2 must be in [x0, y0, x1, y1] (corner) format, but got {boxes2}")
+    iou, union = box_iou(boxes1, boxes2)
+
+    top_left = torch.min(boxes1[:, None, :2], boxes2[:, :2])
+    bottom_right = torch.max(boxes1[:, None, 2:], boxes2[:, 2:])
+
+    width_height = (bottom_right - top_left).clamp(min=0)  # [N,M,2]
+    area = width_height[:, :, 0] * width_height[:, :, 1]
+
+    return iou - (area - union) / area
+
+
+# below: taken from https://github.com/facebookresearch/detr/blob/master/util/misc.py#L306
+def _max_by_axis(the_list):
+    # type: (List[List[int]]) -> List[int]
+    maxes = the_list[0]
+    for sublist in the_list[1:]:
+        for index, item in enumerate(sublist):
+            maxes[index] = max(maxes[index], item)
+    return maxes
+
+
+class NestedTensor:
+    def __init__(self, tensors, mask: Optional[Tensor]):
+        self.tensors = tensors
+        self.mask = mask
+
+    def to(self, device):
+        cast_tensor = self.tensors.to(device)
+        mask = self.mask
+        if mask is not None:
+            cast_mask = mask.to(device)
+        else:
+            cast_mask = None
+        return NestedTensor(cast_tensor, cast_mask)
+
+    def decompose(self):
+        return self.tensors, self.mask
+
+    def __repr__(self):
+        return str(self.tensors)
+
+
+def nested_tensor_from_tensor_list(tensor_list: List[Tensor]):
+    if tensor_list[0].ndim == 3:
+        max_size = _max_by_axis([list(img.shape) for img in tensor_list])
+        batch_shape = [len(tensor_list)] + max_size
+        batch_size, num_channels, height, width = batch_shape
+        dtype = tensor_list[0].dtype
+        device = tensor_list[0].device
+        tensor = torch.zeros(batch_shape, dtype=dtype, device=device)
+        mask = torch.ones((batch_size, height, width), dtype=torch.bool, device=device)
+        for img, pad_img, m in zip(tensor_list, tensor, mask):
+            pad_img[: img.shape[0], : img.shape[1], : img.shape[2]].copy_(img)
+            m[: img.shape[1], : img.shape[2]] = False
+    else:
+        raise ValueError("Only 3-dimensional tensors are supported")
+    return NestedTensor(tensor, mask)
+
+
+# taken from https://github.com/facebookresearch/detr/blob/master/models/detr.py
+@torch.jit.unused
+def _set_aux_loss(outputs_class, outputs_coord):
+    # this is a workaround to make torchscript happy, as torchscript
+    # doesn't support dictionary with non-homogeneous values, such
+    # as a dict having both a Tensor and a list.
+    return [{"logits": a, "pred_boxes": b} for a, b in zip(outputs_class[:-1], outputs_coord[:-1])]
+
+
+def ForSegmentationLoss(
+    logits, labels, device, pred_boxes, pred_masks, config, outputs_class=None, outputs_coord=None, **kwargs
+):
+    # First: create the matcher
+    matcher = HungarianMatcher(class_cost=config.class_cost, bbox_cost=config.bbox_cost, giou_cost=config.giou_cost)
+    # Second: create the criterion
+    losses = ["labels", "boxes", "cardinality", "masks"]
+    criterion = ImageLoss(
+        matcher=matcher,
+        num_classes=config.num_labels,
+        eos_coef=config.eos_coefficient,
+        losses=losses,
+    )
+    criterion.to(device)
+    # Third: compute the losses, based on outputs and labels
+    outputs_loss = {}
+    outputs_loss["logits"] = logits
+    outputs_loss["pred_boxes"] = pred_boxes
+    outputs_loss["pred_masks"] = pred_masks
+
+    auxiliary_outputs = None
+    if config.auxiliary_loss:
+        auxiliary_outputs = _set_aux_loss(outputs_class, outputs_coord)
+        outputs_loss["auxiliary_outputs"] = auxiliary_outputs
+
+    loss_dict = criterion(outputs_loss, labels)
+    # Fourth: compute total loss, as a weighted sum of the various losses
+    weight_dict = {"loss_ce": 1, "loss_bbox": config.bbox_loss_coefficient}
+    weight_dict["loss_giou"] = config.giou_loss_coefficient
+    weight_dict["loss_mask"] = config.mask_loss_coefficient
+    weight_dict["loss_dice"] = config.dice_loss_coefficient
+    if config.auxiliary_loss:
+        aux_weight_dict = {}
+        for i in range(config.decoder_layers - 1):
+            aux_weight_dict.update({k + f"_{i}": v for k, v in weight_dict.items()})
+        weight_dict.update(aux_weight_dict)
+    loss = sum(loss_dict[k] * weight_dict[k] for k in loss_dict.keys() if k in weight_dict)
+    return loss, loss_dict, auxiliary_outputs
+
+
+def ForObjectDetectionLoss(
+    logits, labels, device, pred_boxes, config, outputs_class=None, outputs_coord=None, **kwargs
+):
+    # First: create the matcher
+    matcher = HungarianMatcher(class_cost=config.class_cost, bbox_cost=config.bbox_cost, giou_cost=config.giou_cost)
+    # Second: create the criterion
+    losses = ["labels", "boxes", "cardinality"]
+    criterion = ImageLoss(
+        matcher=matcher,
+        num_classes=config.num_labels,
+        eos_coef=config.eos_coefficient,
+        losses=losses,
+    )
+    criterion.to(device)
+    # Third: compute the losses, based on outputs and labels
+    outputs_loss = {}
+    auxiliary_outputs = None
+    outputs_loss["logits"] = logits
+    outputs_loss["pred_boxes"] = pred_boxes
+    if config.auxiliary_loss:
+        auxiliary_outputs = _set_aux_loss(outputs_class, outputs_coord)
+        outputs_loss["auxiliary_outputs"] = auxiliary_outputs
+
+    loss_dict = criterion(outputs_loss, labels)
+    # Fourth: compute total loss, as a weighted sum of the various losses
+    weight_dict = {"loss_ce": 1, "loss_bbox": config.bbox_loss_coefficient}
+    weight_dict["loss_giou"] = config.giou_loss_coefficient
+    if config.auxiliary_loss:
+        aux_weight_dict = {}
+        for i in range(config.decoder_layers - 1):
+            aux_weight_dict.update({k + f"_{i}": v for k, v in weight_dict.items()})
+        weight_dict.update(aux_weight_dict)
+    loss = sum(loss_dict[k] * weight_dict[k] for k in loss_dict.keys() if k in weight_dict)
+    return loss, loss_dict, auxiliary_outputs
diff --git a/.venv/lib/python3.11/site-packages/transformers/loss/loss_rt_detr.py b/.venv/lib/python3.11/site-packages/transformers/loss/loss_rt_detr.py
new file mode 100644
index 0000000000000000000000000000000000000000..3aea87c5f5a75a9aa99dbbfcfc5a50091e22d8c8
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/transformers/loss/loss_rt_detr.py
@@ -0,0 +1,463 @@
+# Copyright 2020 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+from ..utils import is_scipy_available, is_vision_available, requires_backends
+from .loss_for_object_detection import (
+    _set_aux_loss,
+    box_iou,
+    dice_loss,
+    generalized_box_iou,
+    nested_tensor_from_tensor_list,
+    sigmoid_focal_loss,
+)
+
+
+if is_scipy_available():
+    from scipy.optimize import linear_sum_assignment
+
+
+if is_vision_available():
+    from transformers.image_transforms import center_to_corners_format
+
+
+class RTDetrHungarianMatcher(nn.Module):
+    """This class computes an assignment between the targets and the predictions of the network
+
+    For efficiency reasons, the targets don't include the no_object. Because of this, in general, there are more
+    predictions than targets. In this case, we do a 1-to-1 matching of the best predictions, while the others are
+    un-matched (and thus treated as non-objects).
+
+    Args:
+        config: RTDetrConfig
+    """
+
+    def __init__(self, config):
+        super().__init__()
+        requires_backends(self, ["scipy"])
+
+        self.class_cost = config.matcher_class_cost
+        self.bbox_cost = config.matcher_bbox_cost
+        self.giou_cost = config.matcher_giou_cost
+
+        self.use_focal_loss = config.use_focal_loss
+        self.alpha = config.matcher_alpha
+        self.gamma = config.matcher_gamma
+
+        if self.class_cost == self.bbox_cost == self.giou_cost == 0:
+            raise ValueError("All costs of the Matcher can't be 0")
+
+    @torch.no_grad()
+    def forward(self, outputs, targets):
+        """Performs the matching
+
+        Params:
+            outputs: This is a dict that contains at least these entries:
+                 "logits": Tensor of dim [batch_size, num_queries, num_classes] with the classification logits
+                 "pred_boxes": Tensor of dim [batch_size, num_queries, 4] with the predicted box coordinates
+
+            targets: This is a list of targets (len(targets) = batch_size), where each target is a dict containing:
+                 "class_labels": Tensor of dim [num_target_boxes] (where num_target_boxes is the number of ground-truth
+                           objects in the target) containing the class labels
+                 "boxes": Tensor of dim [num_target_boxes, 4] containing the target box coordinates
+
+        Returns:
+            A list of size batch_size, containing tuples of (index_i, index_j) where:
+                - index_i is the indices of the selected predictions (in order)
+                - index_j is the indices of the corresponding selected targets (in order)
+            For each batch element, it holds:
+                len(index_i) = len(index_j) = min(num_queries, num_target_boxes)
+        """
+        batch_size, num_queries = outputs["logits"].shape[:2]
+
+        # We flatten to compute the cost matrices in a batch
+        out_bbox = outputs["pred_boxes"].flatten(0, 1)  # [batch_size * num_queries, 4]
+        # Also concat the target labels and boxes
+        target_ids = torch.cat([v["class_labels"] for v in targets])
+        target_bbox = torch.cat([v["boxes"] for v in targets])
+        # Compute the classification cost. Contrary to the loss, we don't use the NLL,
+        # but approximate it in 1 - proba[target class].
+        # The 1 is a constant that doesn't change the matching, it can be ommitted.
+        if self.use_focal_loss:
+            out_prob = F.sigmoid(outputs["logits"].flatten(0, 1))
+            out_prob = out_prob[:, target_ids]
+            neg_cost_class = (1 - self.alpha) * (out_prob**self.gamma) * (-(1 - out_prob + 1e-8).log())
+            pos_cost_class = self.alpha * ((1 - out_prob) ** self.gamma) * (-(out_prob + 1e-8).log())
+            class_cost = pos_cost_class - neg_cost_class
+        else:
+            out_prob = outputs["logits"].flatten(0, 1).softmax(-1)  # [batch_size * num_queries, num_classes]
+            class_cost = -out_prob[:, target_ids]
+
+        # Compute the L1 cost between boxes
+        bbox_cost = torch.cdist(out_bbox, target_bbox, p=1)
+        # Compute the giou cost betwen boxes
+        giou_cost = -generalized_box_iou(center_to_corners_format(out_bbox), center_to_corners_format(target_bbox))
+        # Compute the final cost matrix
+        cost_matrix = self.bbox_cost * bbox_cost + self.class_cost * class_cost + self.giou_cost * giou_cost
+        cost_matrix = cost_matrix.view(batch_size, num_queries, -1).cpu()
+
+        sizes = [len(v["boxes"]) for v in targets]
+        indices = [linear_sum_assignment(c[i]) for i, c in enumerate(cost_matrix.split(sizes, -1))]
+
+        return [(torch.as_tensor(i, dtype=torch.int64), torch.as_tensor(j, dtype=torch.int64)) for i, j in indices]
+
+
+class RTDetrLoss(nn.Module):
+    """
+    This class computes the losses for RTDetr. The process happens in two steps: 1) we compute hungarian assignment
+    between ground truth boxes and the outputs of the model 2) we supervise each pair of matched ground-truth /
+    prediction (supervise class and box).
+
+    Args:
+        matcher (`DetrHungarianMatcher`):
+            Module able to compute a matching between targets and proposals.
+        weight_dict (`Dict`):
+            Dictionary relating each loss with its weights. These losses are configured in RTDetrConf as
+            `weight_loss_vfl`, `weight_loss_bbox`, `weight_loss_giou`
+        losses (`List[str]`):
+            List of all the losses to be applied. See `get_loss` for a list of all available losses.
+        alpha (`float`):
+            Parameter alpha used to compute the focal loss.
+        gamma (`float`):
+            Parameter gamma used to compute the focal loss.
+        eos_coef (`float`):
+            Relative classification weight applied to the no-object category.
+        num_classes (`int`):
+            Number of object categories, omitting the special no-object category.
+    """
+
+    def __init__(self, config):
+        super().__init__()
+
+        self.matcher = RTDetrHungarianMatcher(config)
+        self.num_classes = config.num_labels
+        self.weight_dict = {
+            "loss_vfl": config.weight_loss_vfl,
+            "loss_bbox": config.weight_loss_bbox,
+            "loss_giou": config.weight_loss_giou,
+        }
+        self.losses = ["vfl", "boxes"]
+        self.eos_coef = config.eos_coefficient
+        empty_weight = torch.ones(config.num_labels + 1)
+        empty_weight[-1] = self.eos_coef
+        self.register_buffer("empty_weight", empty_weight)
+        self.alpha = config.focal_loss_alpha
+        self.gamma = config.focal_loss_gamma
+
+    def loss_labels_vfl(self, outputs, targets, indices, num_boxes, log=True):
+        if "pred_boxes" not in outputs:
+            raise KeyError("No predicted boxes found in outputs")
+        if "logits" not in outputs:
+            raise KeyError("No predicted logits found in outputs")
+        idx = self._get_source_permutation_idx(indices)
+
+        src_boxes = outputs["pred_boxes"][idx]
+        target_boxes = torch.cat([_target["boxes"][i] for _target, (_, i) in zip(targets, indices)], dim=0)
+        ious, _ = box_iou(center_to_corners_format(src_boxes), center_to_corners_format(target_boxes))
+        ious = torch.diag(ious).detach()
+
+        src_logits = outputs["logits"]
+        target_classes_original = torch.cat([_target["class_labels"][i] for _target, (_, i) in zip(targets, indices)])
+        target_classes = torch.full(
+            src_logits.shape[:2], self.num_classes, dtype=torch.int64, device=src_logits.device
+        )
+        target_classes[idx] = target_classes_original
+        target = F.one_hot(target_classes, num_classes=self.num_classes + 1)[..., :-1]
+
+        target_score_original = torch.zeros_like(target_classes, dtype=src_logits.dtype)
+        target_score_original[idx] = ious.to(target_score_original.dtype)
+        target_score = target_score_original.unsqueeze(-1) * target
+
+        pred_score = F.sigmoid(src_logits).detach()
+        weight = self.alpha * pred_score.pow(self.gamma) * (1 - target) + target_score
+
+        loss = F.binary_cross_entropy_with_logits(src_logits, target_score, weight=weight, reduction="none")
+        loss = loss.mean(1).sum() * src_logits.shape[1] / num_boxes
+        return {"loss_vfl": loss}
+
+    def loss_labels(self, outputs, targets, indices, num_boxes, log=True):
+        """Classification loss (NLL)
+        targets dicts must contain the key "class_labels" containing a tensor of dim [nb_target_boxes]
+        """
+        if "logits" not in outputs:
+            raise KeyError("No logits were found in the outputs")
+
+        src_logits = outputs["logits"]
+
+        idx = self._get_source_permutation_idx(indices)
+        target_classes_original = torch.cat([_target["class_labels"][i] for _target, (_, i) in zip(targets, indices)])
+        target_classes = torch.full(
+            src_logits.shape[:2], self.num_classes, dtype=torch.int64, device=src_logits.device
+        )
+        target_classes[idx] = target_classes_original
+
+        loss_ce = F.cross_entropy(src_logits.transpose(1, 2), target_classes, self.class_weight)
+        losses = {"loss_ce": loss_ce}
+        return losses
+
+    @torch.no_grad()
+    def loss_cardinality(self, outputs, targets, indices, num_boxes):
+        """
+        Compute the cardinality error, i.e. the absolute error in the number of predicted non-empty boxes. This is not
+        really a loss, it is intended for logging purposes only. It doesn't propagate gradients.
+        """
+        logits = outputs["logits"]
+        device = logits.device
+        target_lengths = torch.as_tensor([len(v["class_labels"]) for v in targets], device=device)
+        # Count the number of predictions that are NOT "no-object" (which is the last class)
+        card_pred = (logits.argmax(-1) != logits.shape[-1] - 1).sum(1)
+        card_err = nn.functional.l1_loss(card_pred.float(), target_lengths.float())
+        losses = {"cardinality_error": card_err}
+        return losses
+
+    def loss_boxes(self, outputs, targets, indices, num_boxes):
+        """
+        Compute the losses related to the bounding boxes, the L1 regression loss and the GIoU loss. Targets dicts must
+        contain the key "boxes" containing a tensor of dim [nb_target_boxes, 4]. The target boxes are expected in
+        format (center_x, center_y, w, h), normalized by the image size.
+        """
+        if "pred_boxes" not in outputs:
+            raise KeyError("No predicted boxes found in outputs")
+        idx = self._get_source_permutation_idx(indices)
+        src_boxes = outputs["pred_boxes"][idx]
+        target_boxes = torch.cat([t["boxes"][i] for t, (_, i) in zip(targets, indices)], dim=0)
+
+        losses = {}
+
+        loss_bbox = F.l1_loss(src_boxes, target_boxes, reduction="none")
+        losses["loss_bbox"] = loss_bbox.sum() / num_boxes
+
+        loss_giou = 1 - torch.diag(
+            generalized_box_iou(center_to_corners_format(src_boxes), center_to_corners_format(target_boxes))
+        )
+        losses["loss_giou"] = loss_giou.sum() / num_boxes
+        return losses
+
+    def loss_masks(self, outputs, targets, indices, num_boxes):
+        """
+        Compute the losses related to the masks: the focal loss and the dice loss. Targets dicts must contain the key
+        "masks" containing a tensor of dim [nb_target_boxes, h, w].
+        """
+        if "pred_masks" not in outputs:
+            raise KeyError("No predicted masks found in outputs")
+
+        source_idx = self._get_source_permutation_idx(indices)
+        target_idx = self._get_target_permutation_idx(indices)
+        source_masks = outputs["pred_masks"]
+        source_masks = source_masks[source_idx]
+        masks = [t["masks"] for t in targets]
+        target_masks, valid = nested_tensor_from_tensor_list(masks).decompose()
+        target_masks = target_masks.to(source_masks)
+        target_masks = target_masks[target_idx]
+
+        # upsample predictions to the target size
+        source_masks = nn.functional.interpolate(
+            source_masks[:, None], size=target_masks.shape[-2:], mode="bilinear", align_corners=False
+        )
+        source_masks = source_masks[:, 0].flatten(1)
+
+        target_masks = target_masks.flatten(1)
+        target_masks = target_masks.view(source_masks.shape)
+        losses = {
+            "loss_mask": sigmoid_focal_loss(source_masks, target_masks, num_boxes),
+            "loss_dice": dice_loss(source_masks, target_masks, num_boxes),
+        }
+        return losses
+
+    def loss_labels_bce(self, outputs, targets, indices, num_boxes, log=True):
+        src_logits = outputs["logits"]
+        idx = self._get_source_permutation_idx(indices)
+        target_classes_original = torch.cat([_target["class_labels"][i] for _target, (_, i) in zip(targets, indices)])
+        target_classes = torch.full(
+            src_logits.shape[:2], self.num_classes, dtype=torch.int64, device=src_logits.device
+        )
+        target_classes[idx] = target_classes_original
+
+        target = F.one_hot(target_classes, num_classes=self.num_classes + 1)[..., :-1]
+        loss = F.binary_cross_entropy_with_logits(src_logits, target * 1.0, reduction="none")
+        loss = loss.mean(1).sum() * src_logits.shape[1] / num_boxes
+        return {"loss_bce": loss}
+
+    def _get_source_permutation_idx(self, indices):
+        # permute predictions following indices
+        batch_idx = torch.cat([torch.full_like(source, i) for i, (source, _) in enumerate(indices)])
+        source_idx = torch.cat([source for (source, _) in indices])
+        return batch_idx, source_idx
+
+    def _get_target_permutation_idx(self, indices):
+        # permute targets following indices
+        batch_idx = torch.cat([torch.full_like(target, i) for i, (_, target) in enumerate(indices)])
+        target_idx = torch.cat([target for (_, target) in indices])
+        return batch_idx, target_idx
+
+    def loss_labels_focal(self, outputs, targets, indices, num_boxes, log=True):
+        if "logits" not in outputs:
+            raise KeyError("No logits found in outputs")
+
+        src_logits = outputs["logits"]
+
+        idx = self._get_source_permutation_idx(indices)
+        target_classes_original = torch.cat([_target["class_labels"][i] for _target, (_, i) in zip(targets, indices)])
+        target_classes = torch.full(
+            src_logits.shape[:2], self.num_classes, dtype=torch.int64, device=src_logits.device
+        )
+        target_classes[idx] = target_classes_original
+
+        target = F.one_hot(target_classes, num_classes=self.num_classes + 1)[..., :-1]
+        loss = sigmoid_focal_loss(src_logits, target, self.alpha, self.gamma)
+        loss = loss.mean(1).sum() * src_logits.shape[1] / num_boxes
+        return {"loss_focal": loss}
+
+    def get_loss(self, loss, outputs, targets, indices, num_boxes):
+        loss_map = {
+            "labels": self.loss_labels,
+            "cardinality": self.loss_cardinality,
+            "boxes": self.loss_boxes,
+            "masks": self.loss_masks,
+            "bce": self.loss_labels_bce,
+            "focal": self.loss_labels_focal,
+            "vfl": self.loss_labels_vfl,
+        }
+        if loss not in loss_map:
+            raise ValueError(f"Loss {loss} not supported")
+        return loss_map[loss](outputs, targets, indices, num_boxes)
+
+    @staticmethod
+    def get_cdn_matched_indices(dn_meta, targets):
+        dn_positive_idx, dn_num_group = dn_meta["dn_positive_idx"], dn_meta["dn_num_group"]
+        num_gts = [len(t["class_labels"]) for t in targets]
+        device = targets[0]["class_labels"].device
+
+        dn_match_indices = []
+        for i, num_gt in enumerate(num_gts):
+            if num_gt > 0:
+                gt_idx = torch.arange(num_gt, dtype=torch.int64, device=device)
+                gt_idx = gt_idx.tile(dn_num_group)
+                assert len(dn_positive_idx[i]) == len(gt_idx)
+                dn_match_indices.append((dn_positive_idx[i], gt_idx))
+            else:
+                dn_match_indices.append(
+                    (
+                        torch.zeros(0, dtype=torch.int64, device=device),
+                        torch.zeros(0, dtype=torch.int64, device=device),
+                    )
+                )
+
+        return dn_match_indices
+
+    def forward(self, outputs, targets):
+        """
+        This performs the loss computation.
+
+        Args:
+             outputs (`dict`, *optional*):
+                Dictionary of tensors, see the output specification of the model for the format.
+             targets (`List[dict]`, *optional*):
+                List of dicts, such that `len(targets) == batch_size`. The expected keys in each dict depends on the
+                losses applied, see each loss' doc.
+        """
+        outputs_without_aux = {k: v for k, v in outputs.items() if "auxiliary_outputs" not in k}
+
+        # Retrieve the matching between the outputs of the last layer and the targets
+        indices = self.matcher(outputs_without_aux, targets)
+
+        # Compute the average number of target boxes across all nodes, for normalization purposes
+        num_boxes = sum(len(t["class_labels"]) for t in targets)
+        num_boxes = torch.as_tensor([num_boxes], dtype=torch.float, device=next(iter(outputs.values())).device)
+        num_boxes = torch.clamp(num_boxes, min=1).item()
+
+        # Compute all the requested losses
+        losses = {}
+        for loss in self.losses:
+            l_dict = self.get_loss(loss, outputs, targets, indices, num_boxes)
+            l_dict = {k: l_dict[k] * self.weight_dict[k] for k in l_dict if k in self.weight_dict}
+            losses.update(l_dict)
+
+        # In case of auxiliary losses, we repeat this process with the output of each intermediate layer.
+        if "auxiliary_outputs" in outputs:
+            for i, auxiliary_outputs in enumerate(outputs["auxiliary_outputs"]):
+                indices = self.matcher(auxiliary_outputs, targets)
+                for loss in self.losses:
+                    if loss == "masks":
+                        # Intermediate masks losses are too costly to compute, we ignore them.
+                        continue
+                    l_dict = self.get_loss(loss, auxiliary_outputs, targets, indices, num_boxes)
+                    l_dict = {k: l_dict[k] * self.weight_dict[k] for k in l_dict if k in self.weight_dict}
+                    l_dict = {k + f"_aux_{i}": v for k, v in l_dict.items()}
+                    losses.update(l_dict)
+
+        # In case of cdn auxiliary losses. For rtdetr
+        if "dn_auxiliary_outputs" in outputs:
+            if "denoising_meta_values" not in outputs:
+                raise ValueError(
+                    "The output must have the 'denoising_meta_values` key. Please, ensure that 'outputs' includes a 'denoising_meta_values' entry."
+                )
+            indices = self.get_cdn_matched_indices(outputs["denoising_meta_values"], targets)
+            num_boxes = num_boxes * outputs["denoising_meta_values"]["dn_num_group"]
+
+            for i, auxiliary_outputs in enumerate(outputs["dn_auxiliary_outputs"]):
+                # indices = self.matcher(auxiliary_outputs, targets)
+                for loss in self.losses:
+                    if loss == "masks":
+                        # Intermediate masks losses are too costly to compute, we ignore them.
+                        continue
+                    kwargs = {}
+                    l_dict = self.get_loss(loss, auxiliary_outputs, targets, indices, num_boxes, **kwargs)
+                    l_dict = {k: l_dict[k] * self.weight_dict[k] for k in l_dict if k in self.weight_dict}
+                    l_dict = {k + f"_dn_{i}": v for k, v in l_dict.items()}
+                    losses.update(l_dict)
+
+        return losses
+
+
+def RTDetrForObjectDetectionLoss(
+    logits,
+    labels,
+    device,
+    pred_boxes,
+    config,
+    outputs_class=None,
+    outputs_coord=None,
+    enc_topk_logits=None,
+    enc_topk_bboxes=None,
+    denoising_meta_values=None,
+    **kwargs,
+):
+    criterion = RTDetrLoss(config)
+    criterion.to(device)
+    # Second: compute the losses, based on outputs and labels
+    outputs_loss = {}
+    outputs_loss["logits"] = logits
+    outputs_loss["pred_boxes"] = pred_boxes
+    if config.auxiliary_loss:
+        if denoising_meta_values is not None:
+            dn_out_coord, outputs_coord = torch.split(outputs_coord, denoising_meta_values["dn_num_split"], dim=2)
+            dn_out_class, outputs_class = torch.split(outputs_class, denoising_meta_values["dn_num_split"], dim=2)
+
+        auxiliary_outputs = _set_aux_loss(outputs_class[:, :-1].transpose(0, 1), outputs_coord[:, :-1].transpose(0, 1))
+        outputs_loss["auxiliary_outputs"] = auxiliary_outputs
+        outputs_loss["auxiliary_outputs"].extend(_set_aux_loss([enc_topk_logits], [enc_topk_bboxes]))
+        if denoising_meta_values is not None:
+            outputs_loss["dn_auxiliary_outputs"] = _set_aux_loss(
+                dn_out_class.transpose(0, 1), dn_out_coord.transpose(0, 1)
+            )
+            outputs_loss["denoising_meta_values"] = denoising_meta_values
+
+    loss_dict = criterion(outputs_loss, labels)
+
+    loss = sum(loss_dict.values())
+    return loss, loss_dict, auxiliary_outputs
diff --git a/.venv/lib/python3.11/site-packages/transformers/loss/loss_utils.py b/.venv/lib/python3.11/site-packages/transformers/loss/loss_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..2e8e2bb5f149d735da2c363d487e710351291e86
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/transformers/loss/loss_utils.py
@@ -0,0 +1,134 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch
+import torch.nn as nn
+from torch.nn import BCEWithLogitsLoss, MSELoss
+
+from .loss_deformable_detr import DeformableDetrForObjectDetectionLoss, DeformableDetrForSegmentationLoss
+from .loss_for_object_detection import ForObjectDetectionLoss, ForSegmentationLoss
+from .loss_rt_detr import RTDetrForObjectDetectionLoss
+
+
+def fixed_cross_entropy(source, target, num_items_in_batch: int = None, ignore_index: int = -100, **kwargs):
+    reduction = "sum" if num_items_in_batch is not None else "mean"
+    loss = nn.functional.cross_entropy(source, target, ignore_index=ignore_index, reduction=reduction)
+    if reduction == "sum":
+        loss = loss / num_items_in_batch
+    return loss
+
+
+def ForCausalLMLoss(
+    logits, labels, vocab_size: int, num_items_in_batch: int = None, ignore_index: int = -100, **kwargs
+):
+    # Upcast to float if we need to compute the loss to avoid potential precision issues
+    logits = logits.float()
+    labels = labels.to(logits.device)
+    # Shift so that tokens < n predict n
+    shift_logits = logits[..., :-1, :].contiguous()
+    shift_labels = labels[..., 1:].contiguous()
+
+    # Flatten the tokens
+    shift_logits = shift_logits.view(-1, vocab_size)
+    shift_labels = shift_labels.view(-1)
+    # Enable model parallelism
+    shift_labels = shift_labels.to(shift_logits.device)
+    loss = fixed_cross_entropy(shift_logits, shift_labels, num_items_in_batch, ignore_index, **kwargs)
+    return loss
+
+
+def ForMaskedLMLoss(
+    logits, labels, vocab_size: int, num_items_in_batch: int = None, ignore_index: int = -100, **kwargs
+):
+    # Upcast to float if we need to compute the loss to avoid potential precision issues
+    logits = logits.float()
+    labels = labels.to(logits.device)
+
+    # Flatten the tokens
+    logits = logits.view(-1, vocab_size)
+    labels = labels.view(-1)
+    # Enable model parallelism
+
+    labels = labels.to(logits.device)
+    loss = fixed_cross_entropy(logits, labels, num_items_in_batch, ignore_index, **kwargs)
+    return loss
+
+
+def ForSequenceClassificationLoss(labels, pooled_logits, config, **kwargs):
+    num_labels = config.num_labels
+    if config.problem_type is None:
+        if num_labels == 1:
+            config.problem_type = "regression"
+        elif num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
+            config.problem_type = "single_label_classification"
+        else:
+            config.problem_type = "multi_label_classification"
+
+    labels = labels.to(pooled_logits.device)
+    if config.problem_type == "regression":
+        loss_fct = MSELoss()
+        if num_labels == 1:
+            loss = loss_fct(pooled_logits.squeeze(), labels.squeeze())
+        else:
+            loss = loss_fct(pooled_logits, labels)
+    elif config.problem_type == "single_label_classification":
+        loss = fixed_cross_entropy(pooled_logits.view(-1, num_labels), labels.view(-1), **kwargs)
+    elif config.problem_type == "multi_label_classification":
+        loss_fct = BCEWithLogitsLoss()
+        loss = loss_fct(pooled_logits, labels)
+    return loss
+
+
+def ForQuestionAnsweringLoss(start_logits, end_logits, start_positions, end_positions, **kwargs):
+    total_loss = None
+    if start_positions is not None and end_positions is not None:
+        # If we are on multi-GPU, split add a dimension
+        if len(start_positions.size()) > 1:
+            start_positions = start_positions.squeeze(-1).to(start_logits.device)
+        if len(end_positions.size()) > 1:
+            end_positions = end_positions.squeeze(-1).to(end_logits.device)
+        # sometimes the start/end positions are outside our model inputs, we ignore these terms
+        ignored_index = start_logits.size(1)
+        start_positions = start_positions.clamp(0, ignored_index)
+        end_positions = end_positions.clamp(0, ignored_index)
+
+        start_loss = fixed_cross_entropy(start_logits, start_positions, ignore_index=ignored_index, **kwargs)
+        end_loss = fixed_cross_entropy(end_logits, end_positions, ignore_index=ignored_index, **kwargs)
+        total_loss = (start_loss + end_loss) / 2
+    return total_loss
+
+
+def ForTokenClassification(logits, labels, config, **kwargs):
+    # Upcast to float if we need to compute the loss to avoid potential precision issues
+    logits = logits.view(-1, config.num_labels)
+    labels = labels.view(-1).to(logits.device)
+    logits = logits.float()
+    # Flatten the tokens
+    return fixed_cross_entropy(logits, labels, **kwargs)
+
+
+LOSS_MAPPING = {
+    "ForCausalLM": ForCausalLMLoss,
+    "ForMaskedLM": ForMaskedLMLoss,
+    "ForQuestionAnswering": ForQuestionAnsweringLoss,
+    "ForSequenceClassification": ForSequenceClassificationLoss,
+    "ForTokenClassification": ForTokenClassification,
+    "ForSegmentation": ForSegmentationLoss,
+    "ForObjectDetection": ForObjectDetectionLoss,
+    "DeformableDetrForObjectDetection": DeformableDetrForObjectDetectionLoss,
+    "ConditionalDetrForObjectDetection": DeformableDetrForObjectDetectionLoss,
+    "GroundingDinoForObjectDetection": DeformableDetrForObjectDetectionLoss,
+    "ConditionalDetrForSegmentation": DeformableDetrForSegmentationLoss,
+    "RTDetrForObjectDetection": RTDetrForObjectDetectionLoss,
+}