Spaces:

prity-k
/

hindi-bpe-tokenizer

Sleeping

Upload 6 files

7a55d6f verified 2 months ago

661 Bytes

	"""Text preprocessing for Hindi BPE: NFC normalization and grapheme clustering."""

	import unicodedata

	import regex


	def normalize_text(text: str) -> str:
	    """Return *text* in Unicode Normalization Form C (NFC).

	    NFC canonically decomposes the input and then recomposes it, so
	    canonically equivalent spellings of the same character (for example a
	    base letter followed by a combining mark versus the precomposed code
	    point) end up as one identical code point sequence.

	    Args:
	        text: The string to normalize.

	    Returns:
	        The NFC-normalized string.
	    """
	    return unicodedata.normalize("NFC", text)


	def split_graphemes(text: str) -> list[str]:
	    """Split *text* into Unicode grapheme clusters after NFC normalization.

	    The input is first passed through ``normalize_text`` so that clustering
	    always operates on the canonical composed form.

	    NOTE(review): for Devanagari the clusters are meant to approximate
	    orthographic syllables; whether a conjunct joined by a virama stays in a
	    single cluster presumably depends on the Unicode rules implemented by
	    the installed ``regex`` package -- confirm against the target version.

	    Args:
	        text: The string to split.

	    Returns:
	        The grapheme clusters in order of appearance; an empty list for an
	        empty string.
	    """
	    normalized = normalize_text(text)
	    # ``\X`` (third-party ``regex`` module, not stdlib ``re``) matches one
	    # whole grapheme cluster per hit.
	    return regex.findall(r"\X", normalized)


	def pretokenize_graphemes(text: str) -> list[str]:
	    """Normalize *text* and split it into grapheme clusters.

	    Thin wrapper around ``split_graphemes``, intended to be passed as the
	    BPE ``pretokenize_fn``.

	    Args:
	        text: The string to pre-tokenize.

	    Returns:
	        Exactly what ``split_graphemes(text)`` returns.
	    """
	    return split_graphemes(text)