File size: 1,240 Bytes
473c3a0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
from string import punctuation

from tokenizers import Regex, Tokenizer
from tokenizers.normalizers import Replace, Sequence, Strip


def replace_normalizer(
    tokenizer: Tokenizer,
) -> Tokenizer:
    """
    Install a punctuation-padding, whitespace-collapsing normalizer on a tokenizer.

    The normalizer chain built here consists of, in order:

    1. One literal ``Replace`` per character in ``string.punctuation``, rewriting
       the character as itself surrounded by a single space on each side.
    2. A regex ``Replace`` that collapses every run of whitespace into one space.
    3. A ``Strip(right=True)`` step.

    When the tokenizer already has a normalizer, that normalizer runs first and the
    steps above are appended after it; otherwise the steps form the whole chain.
    The tokenizer is modified in place and the same object is returned.

    NOTE(review): only ``right=True`` is passed to ``Strip``; whether the left side
    is stripped as well depends on ``Strip``'s default for ``left`` -- confirm
    against the tokenizers documentation.

    :param tokenizer: The tokenizer whose normalizer is replaced.
    :return: The same tokenizer, now carrying the new normalizer.
    """
    existing = tokenizer.normalizer

    # Pad every punctuation character with spaces so it becomes its own unit.
    steps = [Replace(symbol, f" {symbol} ") for symbol in punctuation]
    # The padding above can create whitespace runs; squash them, then strip.
    steps += [Replace(Regex(r"\s+"), " "), Strip(right=True)]

    # Keep any pre-existing normalizer at the front of the chain.
    if existing is not None:
        steps = [existing, *steps]

    tokenizer.normalizer = Sequence(steps)  # type: ignore
    return tokenizer