Sarthak
chore: moved model2vec into an internal package
473c3a0
from string import punctuation
from tokenizers import Regex, Tokenizer
from tokenizers.normalizers import Replace, Sequence, Strip
def replace_normalizer(
    tokenizer: Tokenizer,
) -> Tokenizer:
    """
    Install a punctuation-splitting normalizer on the tokenizer.

    Every character in `string.punctuation` is padded with one space on each side,
    runs of whitespace are then collapsed into a single space, and finally a
    `Strip(right=True)` step trims the result.
    A normalizer already present on the tokenizer is kept as the first step, with the
    new steps chained after it. Without an existing normalizer, the new steps form
    the whole pipeline.
    The tokenizer is modified in place and also returned.

    :param tokenizer: The tokenizer to change.
    :return: The same tokenizer, now carrying the combined normalizer.
    """
    # Pad punctuation first; the whitespace collapse afterwards cleans up the extra spaces.
    steps = [Replace(symbol, f" {symbol} ") for symbol in punctuation]
    steps += [Replace(Regex(r"\s+"), " "), Strip(right=True)]

    existing = tokenizer.normalizer
    if existing is not None:
        # Preserve the original normalization by placing it ahead of the new steps.
        steps.insert(0, existing)

    tokenizer.normalizer = Sequence(steps)  # type: ignore
    return tokenizer