"""Text preprocessing for Hindi BPE: NFC normalization and grapheme clustering.""" import unicodedata import regex def normalize_text(text: str) -> str: """Apply NFC normalization for canonical Unicode composition.""" return unicodedata.normalize("NFC", text) def split_graphemes(text: str) -> list[str]: """Split text into Unicode grapheme clusters (orthographic syllables for Devanagari).""" normalized = normalize_text(text) return regex.findall(r"\X", normalized) def pretokenize_graphemes(text: str) -> list[str]: """Normalize and split into grapheme clusters. Use as BPE pretokenize_fn.""" return split_graphemes(text)