""" This file defines functions used to modify the default behaviour
of transformers.AutoTokenizer. These changes are necessary, because some
tokenizers are meant to be used with raw text, while the OntoNotes documents
have already been split into words.
All the functions are used in coref_model.CorefModel._get_docs. """


# Token predicates used to drop unwanted tokenizer output, keyed by name.
# Each predicate returns False only for the lone "▁" piece (U+2581, which is
# not the ASCII underscore "_") and True for every other token.
TOKENIZER_FILTERS = {
    model_name: (lambda token: token != "▁")
    for model_name in ("albert-xxlarge-v2", "albert-large-v2")
}

# Words whose tokens are given directly instead of coming from a tokenizer.
# Every listed word maps to a one-element list containing the word itself.
TOKENIZER_MAPS = {
    "roberta-large": {
        word: [word]
        for word in (".", ",", "!", "?", ":", ";", "'s")
    },
}