""" This file defines functions used to modify the default behaviour of transformers.AutoTokenizer. These changes are necessary, because some tokenizers are meant to be used with raw text, while the OntoNotes documents have already been split into words. All the functions are used in coref_model.CorefModel._get_docs. """ # Filters out unwanted tokens produced by the tokenizer TOKENIZER_FILTERS = { "albert-xxlarge-v2": (lambda token: token != "▁"), # U+2581, not just "_" "albert-large-v2": (lambda token: token != "▁"), } # Maps some words to tokens directly, without a tokenizer TOKENIZER_MAPS = { "roberta-large": {".": ["."], ",": [","], "!": ["!"], "?": ["?"], ":":[":"], ";":[";"], "'s": ["'s"]} }