File size: 1,559 Bytes
d670799 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 |
# Copyright (c) OpenMMLab. All rights reserved.
from typing import List, Optional
from transformers import BertTokenizer
from mmaction.registry import TOKENIZER
class VindLUTokenizer(BertTokenizer):
    """Tokenizer derived from ``BertTokenizer``.

    It differs from ``BertTokenizer`` in one respect: when special tokens are
    added to a single sequence, the trailing separator token is left out.
    """

    def build_inputs_with_special_tokens(
            self,
            token_ids_0: List[int],
            token_ids_1: Optional[List[int]] = None) -> List[int]:
        """Wrap one sequence, or a pair of sequences, with special tokens.

        The produced layouts are:

        - single sequence: ``[CLS] X``
        - pair of sequences: ``[CLS] A [SEP] B [SEP]``

        Args:
            token_ids_0 (List[int]): Token IDs of the first (or only)
                sequence.
            token_ids_1 (List[int], optional): Token IDs of the second
                sequence when a pair is encoded. Defaults to None.

        Returns:
            List[int]: The token IDs with the special tokens inserted.
        """
        prefix = [self.cls_token_id]
        if token_ids_1 is not None:
            separator = [self.sep_token_id]
            return prefix + token_ids_0 + separator + token_ids_1 + separator
        # Single sequence: intentionally no closing [SEP] token.
        return prefix + token_ids_0
# Register the ``from_pretrained`` constructor (not the class object itself)
# under the key 'VindLUTokenizer'. Presumably this lets registry builds
# forward their arguments straight to ``from_pretrained`` -- confirm against
# the configs that use this tokenizer.
TOKENIZER.register_module(
    name='VindLUTokenizer',
    module=VindLUTokenizer.from_pretrained,
)
|