|
|
|
|
|
from typing import List, Optional
|
|
|
|
|
|
from transformers import BertTokenizer
|
|
|
|
|
|
from mmaction.registry import TOKENIZER
|
|
|
|
|
|
|
|
|
class VindLUTokenizer(BertTokenizer):
    """BertTokenizer subclass used for VindLU.

    It differs from BertTokenizer only in how special tokens are added: a
    single sequence is not terminated with the trailing separator token.
    """

    def build_inputs_with_special_tokens(
            self,
            token_ids_0: List[int],
            token_ids_1: Optional[List[int]] = None) -> List[int]:
        """Wrap one sequence, or a pair of sequences, with special tokens.

        The resulting layouts are:

        - single sequence: `[CLS] X`
        - pair of sequences: `[CLS] A [SEP] B [SEP]`

        Args:
            token_ids_0 (`List[int]`):
                IDs of the first (or only) sequence.
            token_ids_1 (`List[int]`, *optional*):
                IDs of the second sequence when encoding a pair.

        Returns:
            `List[int]`: A new list holding the input IDs with the special
            tokens inserted.
        """
        head = [self.cls_token_id]
        if token_ids_1 is not None:
            # Pair input: the separator closes both segments.
            boundary = [self.sep_token_id]
            return head + token_ids_0 + boundary + token_ids_1 + boundary
        # Single input: only the leading [CLS] is added, no trailing [SEP].
        return head + token_ids_0
|
|
|
|
|
|
|
|
|
# Register `VindLUTokenizer.from_pretrained` (the loader, not the class
# itself) in the TOKENIZER registry under the name 'VindLUTokenizer'.
# NOTE(review): presumably this lets configs build the tokenizer directly
# from a pretrained checkpoint name -- confirm against the registry users.
TOKENIZER.register_module(
    name='VindLUTokenizer',
    module=VindLUTokenizer.from_pretrained,
)
|
|
|
|