"""Custom Telugu SentencePiece tokenizer with clean decode output."""

from transformers import LlamaTokenizer
class TeluguSPTokenizer(LlamaTokenizer):
    """Telugu SentencePiece tokenizer that produces clean decoded text.

    Extends LlamaTokenizer to clean up SentencePiece's ▁ (U+2581) word
    boundary markers, producing natural text output.
    """

    def decode(self, token_ids, skip_special_tokens=False, **kwargs):
        """Decode *token_ids* and normalize whitespace in the result.

        Delegates to ``LlamaTokenizer.decode``, then replaces every
        SentencePiece word-boundary marker (U+2581) with a plain space
        and collapses runs of whitespace into single spaces.
        """
        decoded = super().decode(
            token_ids, skip_special_tokens=skip_special_tokens, **kwargs
        )
        # ▁ (U+2581) marks word starts in SentencePiece output; turn each
        # marker into an ordinary space.
        without_markers = decoded.replace("\u2581", " ")
        # split() with no argument drops leading/trailing whitespace and
        # splits on any whitespace run, so the join yields single spaces.
        return " ".join(without_markers.split())