dvitvaai
/

pothana-base-300M

Text Generation

text-generation-inference

Model card Files Files and versions

pothana-base-300M / tokenizer_class.py

neshkatrapati's picture

Upload folder using huggingface_hub

a6034dd verified 21 days ago

history blame contribute delete

932 Bytes

	"""Custom Telugu tokenizer that handles @@ continuation marker stripping."""
	from transformers import PreTrainedTokenizerFast


	class TeluguTokenizer(PreTrainedTokenizerFast):
	    """Telugu tokenizer with Morfessor @@ continuation marker support.

	    Tokens ending with @@ are continuation pieces that join to the next token.
	    This class overrides decode() to strip @@ markers and join morphemes:
	        "రెడ్డి@@ గారు" → "రెడ్డిగారు"
	    """

	    @staticmethod
	    def _strip_continuation_markers(text):
	        """Return *text* with every @@ continuation marker removed."""
	        # "@@ " between tokens means "join to next token", so the marker is
	        # removed together with the space that follows it. Whatever "@@"
	        # remains afterwards (before punctuation, at end of string, etc.)
	        # is dropped on its own.
	        return text.replace("@@ ", "").replace("@@", "")

	    def decode(self, token_ids, skip_special_tokens=False, **kwargs):
	        """Decode token ids to text and remove @@ continuation markers.

	        Args:
	            token_ids: Ids to decode; forwarded unchanged to the parent
	                class's decode().
	            skip_special_tokens: Forwarded unchanged to the parent class's
	                decode().
	            **kwargs: Any further keyword arguments, forwarded unchanged.

	        Returns:
	            The parent class's decoded output with the markers stripped:
	            a single string, or a list of strings when the parent returns
	            a list or tuple.
	        """
	        decoded = super().decode(token_ids, skip_special_tokens=skip_special_tokens, **kwargs)
	        # NOTE(review): some transformers releases appear to return a list of
	        # strings from decode() for batched input -- confirm against the
	        # pinned version. Handling that shape here avoids an AttributeError
	        # from calling str.replace on a list.
	        if isinstance(decoded, (list, tuple)):
	            return [self._strip_continuation_markers(item) for item in decoded]
	        return self._strip_continuation_markers(decoded)