File size: 25,301 Bytes

f43af3c

import copy
from collections import UserDict
from typing import Optional, Union, Dict, Any, List, Mapping

import numpy as np

from easy_tpp.utils import logger, TruncationStrategy, PaddingStrategy, \
    TensorType, is_torch_device, requires_backends, is_numpy_array, py_assert


class BatchEncoding(UserDict):
    """
    Holds the output of the [`~event_tokenizer.EventTokenizer.__call__`],
    [`~event_tokenizer.EventTokenizer.encode_plus`] methods (tokens, attention_masks, etc).

    This class is derived from a python dictionary and can be used as a dictionary.

    Args:
        data (`dict`):
            Dictionary of lists/arrays/tensors returned by the `__call__`/`encode_plus`/`batch_encode_plus` methods
            ('input_ids', 'attention_mask', etc.).
        tensor_type (`Union[None, str, TensorType]`, *optional*):
            You can give a tensor_type here to convert the lists of integers in PyTorch/TensorFlow/Numpy Tensors at
            initialization.
        prepend_batch_axis (`bool`, *optional*, defaults to `False`):
            Whether or not to add a batch axis when converting to tensors (see `tensor_type` above).
        n_sequences (`Optional[int]`, *optional*):
            You can give a tensor_type here to convert the lists of integers in PyTorch/TensorFlow/Numpy Tensors at
            initialization.
    """

    def __init__(
            self,
            data: Optional[Dict[str, Any]] = None,
            tensor_type: Union[None, str, TensorType] = None,
            prepend_batch_axis: bool = False
    ):
        super().__init__(data)

        self.convert_to_tensors(tensor_type=tensor_type, prepend_batch_axis=prepend_batch_axis)

    def keys(self):
        return self.data.keys()

    def values(self):
        return list(self.data.values())

    def items(self):
        return self.data.items()

    def convert_to_tensors(
            self, tensor_type: Optional[Union[str, TensorType]] = None, prepend_batch_axis: bool = False
    ):
        """
        Convert the inner content to tensors.

        Args:
            tensor_type (`str` or [`~utils.TensorType`], *optional*):
                The type of tensors to use. If `str`, should be one of the values of the enum [`~utils.TensorType`]. If
                `None`, no modification is done.
            prepend_batch_axis (`int`, *optional*, defaults to `False`):
                Whether or not to add the batch dimension during the conversion.
        """
        if tensor_type is None:
            return self

        # Convert to TensorType
        if not isinstance(tensor_type, TensorType):
            tensor_type = TensorType(tensor_type)

        # Get a function reference for the correct framework
        if tensor_type == TensorType.PYTORCH:
            import torch

            as_tensor = torch.tensor
            is_tensor = torch.is_tensor
        else:
            as_tensor = np.asarray
            is_tensor = is_numpy_array

        # Do the tensor conversion in batch
        for key, value in self.items():
            try:
                if prepend_batch_axis:
                    value = [value]

                if not is_tensor(value):
                    tensor = as_tensor(value)

                    self[key] = tensor
            except Exception as e:
                if key == "overflowing_tokens":
                    raise ValueError(
                        "Unable to create tensor returning overflowing tokens of different lengths. "
                        "Please see if a fast version of this tokenizer is available to have this feature available."
                    ) from e
                raise ValueError(
                    "Unable to create tensor, you should probably activate truncation and/or padding with"
                    " 'padding=True' 'truncation=True' to have batched tensors with the same length. Perhaps your"
                    f" features (`{key}` in this case) have excessive nesting (inputs type `list` where type `int` is"
                    " expected)."
                ) from e

        return self

    def to(self, device: Union[str, "torch.device"]) -> "BatchEncoding":
        """
        Send all values to device by calling `v.to(device)` (PyTorch only).

        Args:
            device (`str` or `torch.device`): The device to put the tensors on.

        Returns:
            [`BatchEncoding`]: The same instance after modification.
        """
        requires_backends(self, ["torch"])

        # This check catches things like APEX blindly calling "to" on all inputs to a module
        # Otherwise it passes the casts down and casts the LongTensor containing the token idxs
        # into a HalfTensor
        if isinstance(device, str) or is_torch_device(device) or isinstance(device, int):
            self.data = {k: v.to(device=device) for k, v in self.data.items()}
        else:
            logger.warning(f"Attempting to cast a BatchEncoding to type {str(device)}. This is not supported.")
        return self


class EventTokenizer:
    """
    Base class for tokenizer event sequences, vendored from huggingface/transformer
    """
    padding_side: str = "right"
    truncation_side: str = "right"
    model_input_names: List[str] = ["time_seqs", "time_delta_seqs", "type_seqs", "seq_non_pad_mask", "attention_mask"]

    def __init__(self, config):
        config = copy.deepcopy(config)
        self.num_event_types = config.num_event_types
        self.pad_token_id = config.pad_token_id

        self.model_max_length = config.max_len

        self.padding_strategy = config.padding_strategy
        self.truncation_strategy = config.truncation_strategy

        # Padding and truncation side are right by default and overridden in subclasses. If specified in the kwargs, it
        # is changed.
        self.padding_side = config.pop("padding_side", self.padding_side)
        self.truncation_side = config.pop("truncation_side", self.truncation_side)
        self.model_input_names = config.pop("model_input_names", self.model_input_names)

    def _get_padding_truncation_strategies(
            self, padding=False, truncation=None, max_length=None, verbose=False, **kwargs
    ):
        padding_strategy, truncation_strategy = None, None
        # If you only set max_length, it activates truncation for max_length
        if max_length is not None and padding is False and truncation is None:
            if verbose:
                logger.warning(
                    "Truncation was not explicitly activated but `max_length` is provided a specific value, please"
                    " use `truncation=True` to explicitly truncate examples to max length. Defaulting to"
                    " 'longest_first' truncation strategy"
                )
            truncation = "longest_first"

        # Get padding strategy
        if padding is False:
            if max_length is None:
                padding_strategy = PaddingStrategy.LONGEST
            else:
                padding_strategy = PaddingStrategy.MAX_LENGTH
        elif padding is not False:
            if padding is True:
                if verbose:
                    if max_length is not None and (
                            truncation is None or truncation is False or truncation == "do_not_truncate"
                    ):
                        logger.warn(
                            "`max_length` is ignored when `padding`=`True` and there is no truncation strategy. "
                            "To pad to max length, use `padding='max_length'`."
                        )
                padding_strategy = PaddingStrategy.LONGEST  # Default to pad to the longest sequence in the batch
            elif not isinstance(padding, PaddingStrategy):
                padding_strategy = PaddingStrategy(padding)
            elif isinstance(padding, PaddingStrategy):
                padding_strategy = padding
        else:
            padding_strategy = PaddingStrategy.DO_NOT_PAD

        # Get truncation strategy
        if truncation is not None and truncation is not False:
            if truncation is True:
                truncation_strategy = (
                    TruncationStrategy.LONGEST_FIRST
                )  # Default to truncate the longest sequences in pairs of inputs
            elif not isinstance(truncation, TruncationStrategy):
                truncation_strategy = TruncationStrategy(truncation)
            elif isinstance(truncation, TruncationStrategy):
                truncation_strategy = truncation
        else:
            truncation_strategy = TruncationStrategy.DO_NOT_TRUNCATE

        # Set max length if needed
        if max_length is None:
            if padding_strategy == PaddingStrategy.MAX_LENGTH:
                max_length = self.model_max_length
            if truncation_strategy != TruncationStrategy.DO_NOT_TRUNCATE:
                max_length = self.model_max_length

        # Test if we have a padding token
        if padding_strategy != PaddingStrategy.DO_NOT_PAD and (not self.pad_token_id):
            raise ValueError(
                "Asking to pad but the tokenizer does not have a padding token. "
                "Please select a token to use as `pad_token` `(tokenizer.pad_token = tokenizer.eos_token e.g.)` "
                "or add a new pad token via `tokenizer.add_special_tokens({'pad_token': '[PAD]'})`."
            )

        return padding_strategy, truncation_strategy, max_length, kwargs

    def _truncate(self,
                  encoded_inputs: Union[Dict[str, Any],
                                        Dict[str, List]],
                  truncation_strategy: TruncationStrategy,
                  truncation_side: str,
                  max_length: Optional[int] = None):
        if truncation_strategy != TruncationStrategy.DO_NOT_TRUNCATE:
            py_assert(max_length is not None, ValueError, 'must pass max_length when truncation is activated!')
            for k, v in encoded_inputs.items():
                seq_ = [seq[:max_length] for seq in v] if truncation_side == 'right' \
                    else [seq[-max_length:] for seq in v]
                encoded_inputs[k] = seq_

        return encoded_inputs

    def pad(
            self,
            encoded_inputs: Union[
                Dict[str, Any],
                Dict[str, List],
            ],
            padding: Union[bool, str, PaddingStrategy] = True,
            truncation: Union[bool, str, TruncationStrategy] = False,
            max_length: Optional[int] = None,
            return_attention_mask: Optional[bool] = None,
            return_tensors: Optional[Union[str, TensorType]] = None,
            verbose: bool = False,
    ) -> BatchEncoding:
        """
        Pad a single encoded input or a batch of encoded inputs up to predefined length or to the max sequence length
        in the batch.

        Padding side (left/right) padding token ids are defined at the tokenizer level (with `self.padding_side`,
        `self.pad_token_id` and `self.pad_token_type_id`).

        Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the
        text followed by a call to the `pad` method to get a padded encoding.

        <Tip>

        If the `encoded_inputs` passed are dictionary of numpy arrays, PyTorch tensors or TensorFlow tensors, the
        result will use the same type unless you provide a different tensor type with `return_tensors`. In the case of
        PyTorch tensors, you will lose the specific device of your tensors however.

        </Tip>

        Args:
            encoded_inputs ([`BatchEncoding`], list of [`BatchEncoding`]:
                Tokenized inputs. Can represent one input ([`BatchEncoding`] or `Dict[str, List[int]]`) or a batch of
                tokenized inputs (list of [`BatchEncoding`], *Dict[str, List[List[int]]]* or *List[Dict[str,
                List[int]]]*) so you can use this method during preprocessing as well as in a PyTorch Dataloader
                collate function.

                Instead of `List[int]` you can have tensors (numpy arrays, PyTorch tensors or TensorFlow tensors), see
                the note above for the return type.
            padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `True`):
                 Select a strategy to pad the returned sequences (according to the model's padding side and padding
                 index) among:

                - `True` or `'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
                  sequence if provided).
                - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum
                  acceptable input length for the model if that argument is not provided.
                - `False` or `'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of different
                  lengths).
            max_length (`int`, *optional*):
                Maximum length of the returned list and optionally padding length (see above).
            return_attention_mask (`bool`, *optional*):
                Whether to return the attention mask. If left to the default, will return the attention mask according
                to the specific tokenizer's default, defined by the `return_outputs` attribute.

            return_tensors (`str` or [`~utils.TensorType`], *optional*):
                If set, will return tensors instead of list of python integers. Acceptable values are:

                - `'tf'`: Return TensorFlow `tf.constant` objects.
                - `'pt'`: Return PyTorch `torch.Tensor` objects.
                - `'np'`: Return Numpy `np.ndarray` objects.
            verbose (`bool`, *optional*, defaults to `True`):
                Whether or not to print more information and warnings.
        """

        # If we have a list of dicts, let's convert it in a dict of lists
        # We do this to allow using this method as a collate_fn function in PyTorch Dataloader
        if isinstance(encoded_inputs, (list, tuple)) and isinstance(encoded_inputs[0], Mapping):
            encoded_inputs = {key: [example[key] for example in encoded_inputs] for key in encoded_inputs[0].keys()}

        # The model's main input name, usually `time_seqs`, has be passed for padding
        if self.model_input_names[0] not in encoded_inputs:
            raise ValueError(
                "You should supply an encoding or a list of encodings to this method "
                f"that includes {self.model_input_names[0]}, but you provided {list(encoded_inputs.keys())}"
            )

        required_input = encoded_inputs[self.model_input_names[0]]

        padding_strategy, truncation_strategy, max_length, _ = self._get_padding_truncation_strategies(
            padding=padding, max_length=max_length, truncation=truncation, verbose=verbose
        )

        encoded_inputs = self._truncate(encoded_inputs,
                                        truncation_strategy=truncation_strategy,
                                        max_length=max_length,
                                        truncation_side=self.truncation_side)

        batch_size = len(required_input)
        assert all(
            len(v) == batch_size for v in encoded_inputs.values()
        ), "Some items in the output dictionary have a different batch size than others."

        if padding_strategy == PaddingStrategy.LONGEST:
            max_length = max(len(inputs) for inputs in required_input)
            padding_strategy = PaddingStrategy.MAX_LENGTH

        batch_output = self._pad(
            encoded_inputs,
            max_length=max_length,
            padding_strategy=padding_strategy,
            return_attention_mask=return_attention_mask,
        )

        return BatchEncoding(batch_output, tensor_type=return_tensors)

    def _pad(
            self,
            encoded_inputs: Union[Dict[str, Any], BatchEncoding],
            max_length: Optional[int] = None,
            padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
            return_attention_mask: Optional[bool] = None,
    ) -> dict:
        """
        Pad encoded inputs (on left/right and up to predefined length or max length in the batch)

        Args:
            encoded_inputs:
                Dictionary of tokenized inputs (`List[int]`) or batch of tokenized inputs (`List[List[int]]`).
            max_length: maximum length of the returned list and optionally padding length (see below).
                Will truncate by taking into account the special tokens.
            padding_strategy: PaddingStrategy to use for padding.

                - PaddingStrategy.LONGEST Pad to the longest sequence in the batch
                - PaddingStrategy.MAX_LENGTH: Pad to the max length (default)
                - PaddingStrategy.DO_NOT_PAD: Do not pad
                The tokenizer padding sides are defined in self.padding_side:

                    - 'left': pads on the left of the sequences
                    - 'right': pads on the right of the sequences
            pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value.
                This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability
                `>= 7.5` (Volta).
            return_attention_mask:
                (optional) Set to False to avoid returning attention mask (default: set to model specifics)
        """
        # Load from model defaults
        if return_attention_mask is None:
            return_attention_mask = "attention_mask" in self.model_input_names

        required_input = encoded_inputs[self.model_input_names[0]]

        if padding_strategy == PaddingStrategy.LONGEST:
            max_length = len(required_input)

        # check whether we need to pad it
        seq_lens = np.array([len(seq) for seq in required_input])
        is_all_seq_equal_max_length = np.all(seq_lens == max_length)
        needs_to_be_padded = padding_strategy != PaddingStrategy.DO_NOT_PAD and ~is_all_seq_equal_max_length

        batch_output = dict()

        if needs_to_be_padded:
            # time seqs
            batch_output[self.model_input_names[0]] = self.make_pad_sequence(encoded_inputs[self.model_input_names[0]],
                                                                             self.pad_token_id,
                                                                             padding_side=self.padding_side,
                                                                             max_len=max_length)
            # time_delta seqs
            batch_output[self.model_input_names[1]] = self.make_pad_sequence(encoded_inputs[self.model_input_names[1]],
                                                                             self.pad_token_id,
                                                                             padding_side=self.padding_side,
                                                                             max_len=max_length)
            # type_seqs
            batch_output[self.model_input_names[2]] = self.make_pad_sequence(encoded_inputs[self.model_input_names[2]],
                                                                             self.pad_token_id,
                                                                             padding_side=self.padding_side,
                                                                             max_len=max_length,
                                                                             dtype=np.int64)
        else:
            batch_output[self.model_input_names[0]] = np.array(encoded_inputs[self.model_input_names[0]], dtype=np.float32)
            batch_output[self.model_input_names[1]] = np.array(encoded_inputs[self.model_input_names[1]], dtype=np.float32)
            batch_output[self.model_input_names[2]] = np.array(encoded_inputs[self.model_input_names[2]], dtype=np.int64)

        # non_pad_mask; replaced the use of event types by using the original sequence length
        seq_pad_mask = np.full_like(batch_output[self.model_input_names[2]], fill_value=True, dtype=bool)
        for i, seq_len in enumerate(seq_lens):
            seq_pad_mask[i, seq_len:] = False
        batch_output[self.model_input_names[3]] = seq_pad_mask

        if return_attention_mask:
            # attention_mask
            batch_output[self.model_input_names[4]] = self.make_attn_mask_for_pad_sequence(
                batch_output[self.model_input_names[2]],
                self.pad_token_id)
        else:
            batch_output[self.model_input_names[4]] = []

        return batch_output

    @staticmethod
    def make_pad_sequence(seqs,
                          pad_token_id,
                          padding_side,
                          max_len,
                          dtype=np.float32,
                          group_by_event_types=False):
        """Pad the sequence batch-wise.

        Args:
            seqs (list): list of sequences with variational length
            pad_token_id (int, float): optional, a value that used to pad the sequences. If None, then the pad index
            is set to be the event_num_with_pad
            max_len (int): optional, the maximum length of the sequence after padding. If None, then the
            length is set to be the max length of all input sequences.
            pad_at_end (bool): optional, whether to pad the sequnce at the end. If False,
            the sequence is pad at the beginning

        Returns:
            a numpy array of padded sequence


        Example:
        ```python
        seqs = [[0, 1], [3, 4, 5]]
        pad_sequence(seqs, 100)
        >>> [[0, 1, 100], [3, 4, 5]]

        pad_sequence(seqs, 100, max_len=5)
        >>> [[0, 1, 100, 100, 100], [3, 4, 5, 100, 100]]
        ```

        """
        if not group_by_event_types:
            if padding_side == "right":
                pad_seq = np.array([seq + [pad_token_id] * (max_len - len(seq)) for seq in seqs], dtype=dtype)
            else:
                pad_seq = np.array([[pad_token_id] * (max_len - len(seq)) + seq for seq in seqs], dtype=dtype)
        else:
            pad_seq = []
            for seq in seqs:
                if padding_side == "right":
                    pad_seq.append(np.array([s + [pad_token_id] * (max_len - len(s)) for s in seq], dtype=dtype))
                else:
                    pad_seq.append(np.array([[pad_token_id] * (max_len - len(s)) + s for s in seqs], dtype=dtype))

            pad_seq = np.array(pad_seq)
        return pad_seq

    def make_attn_mask_for_pad_sequence(self, pad_seqs, pad_token_id):
        """Make the attention masks for the sequence.

        Args:
            pad_seqs (tensor): list of sequences that have been padded with fixed length
            pad_token_id (int): optional, a value that used to pad the sequences. If None, then the pad index
            is set to be the event_num_with_pad

        Returns:
            np.array: a bool matrix of the same size of input, denoting the masks of the
            sequence (True: non mask, False: mask)


        Example:
        ```python
        seqs = [[ 1,  6,  0,  7, 12, 12],
        [ 1,  0,  5,  1, 10,  9]]
        make_attn_mask_for_pad_sequence(seqs, pad_index=12)
        >>>
            batch_non_pad_mask
            ([[ True,  True,  True,  True, False, False],
            [ True,  True,  True,  True,  True,  True]])
            attention_mask
            [[[ False  True  True  True  True  True]
              [False  False  True  True  True  True]
              [False False  False  True  True  True]
              [False False False  False  True  True]
              [False False False False  True  True]
              [False False False False  True  True]]

             [[False  True  True  True  True  True]
              [False  False  True  True  True  True]
              [False False  False  True  True  True]
              [False False False  False  True  True]
              [False False False False  False  True]
              [False False False False False  False]]]
        ```


        """

        seq_num, seq_len = pad_seqs.shape

        # [batch_size, seq_len]
        seq_pad_mask = pad_seqs == pad_token_id

        # [batch_size, seq_len, seq_len]
        attention_key_pad_mask = np.tile(seq_pad_mask[:, None, :], (1, seq_len, 1))
        subsequent_mask = np.tile(np.triu(np.ones((seq_len, seq_len), dtype=bool), k=1)[None, :, :], (seq_num, 1, 1))

        attention_mask = subsequent_mask | attention_key_pad_mask

        return attention_mask

    def make_type_mask_for_pad_sequence(self, pad_seqs):
        """Make the type mask.

        Args:
            pad_seqs (tensor): a list of sequence events with equal length (i.e., padded sequence)

        Returns:
            np.array: a 3-dim matrix, where the last dim (one-hot vector) indicates the type of event

        """
        type_mask = np.zeros([*pad_seqs.shape, self.num_event_types], dtype=np.int32)
        for i in range(self.num_event_types):
            type_mask[:, :, i] = pad_seqs == i

        return type_mask