Sarthak

chore: moved model2vec as an internal package

473c3a0 8 months ago

1.24 kB

	from string import punctuation

	from tokenizers import Regex, Tokenizer
	from tokenizers.normalizers import Replace, Sequence, Strip


	def replace_normalizer(tokenizer: Tokenizer) -> Tokenizer:
	    """
	    Install a punctuation-splitting normalizer on the tokenizer.

	    The installed normalizer is a sequence that, in order:

	    1. surrounds every character of ``string.punctuation`` with a space on each side,
	    2. collapses every run of whitespace into a single space,
	    3. applies ``Strip(right=True)``.

	    NOTE(review): only ``right`` is passed to ``Strip``; ``left`` keeps the library
	    default (``True`` per the tokenizers documentation), so leading whitespace is
	    presumably stripped as well -- confirm against the installed tokenizers version.

	    If the tokenizer already has a normalizer, it is kept as the first element of the
	    new sequence; otherwise the sequence consists of the new steps only.
	    The tokenizer is modified in place and the same object is returned.

	    :param tokenizer: The tokenizer to change.
	    :return: The tokenizer with a replaced normalizer.
	    """
	    existing = tokenizer.normalizer

	    # One literal replacement per punctuation character: "x" -> " x ".
	    steps = [Replace(symbol, f" {symbol} ") for symbol in punctuation]
	    # The padding above can produce repeated spaces, so squeeze whitespace and strip.
	    steps += [Replace(Regex(r"\s+"), " "), Strip(right=True)]

	    if existing is not None:
	        # Keep the previous normalizer ahead of the new steps.
	        steps = [existing, *steps]

	    tokenizer.normalizer = Sequence(steps)  # type: ignore
	    return tokenizer