tokenizer_class.py · dvitvaai/pothana-chat-300M at main

Upload folder using huggingface_hub

ebb32fd verified 19 days ago

1.43 kB

	"""Custom Telugu tokenizer that handles @@ continuation marker stripping."""
	from transformers import PreTrainedTokenizerFast


	class TeluguTokenizer(PreTrainedTokenizerFast):
	"""Telugu tokenizer with Morfessor @@ continuation marker support.

	Tokens ending with @@ are continuation pieces that join to the next token.
	This class overrides decode() to strip @@ markers and join morphemes:
	"రెడ్డి@@ గారు" → "రెడ్డిగారు"

	Also strips chat special tokens (<\|system\|>, <\|user\|>, <\|assistant\|>, <\|end\|>)
	from decoded output for clean text.
	"""

	# Chat special tokens to strip from output
	_CHAT_SPECIALS = ["<\|system\|>", "<\|user\|>", "<\|assistant\|>", "<\|end\|>"]

	def decode(self, token_ids, skip_special_tokens=False, **kwargs):
	text = super().decode(token_ids, skip_special_tokens=skip_special_tokens, **kwargs)
	# Strip @@ continuation markers:
	# "@@ " between tokens means "join to next token" (no space)
	text = text.replace("@@ ", "")
	# Handle remaining @@ (before punctuation, end of string, etc.)
	text = text.replace("@@", "")
	# Strip chat special tokens
	for special in self._CHAT_SPECIALS:
	text = text.replace(special, "")
	# Clean up extra whitespace from removed tokens
	import re
	text = re.sub(r" +", " ", text).strip()
	return text