Spaces:
Sleeping
Sleeping
| """Text preprocessing for Hindi BPE: NFC normalization and grapheme clustering.""" | |
| import unicodedata | |
| import regex | |
def normalize_text(text: str) -> str:
    """Return *text* converted to Unicode Normalization Form C (NFC).

    NFC canonically composes the input, so canonically equivalent
    spellings of the same text come out as the same code point sequence.
    """
    composed = unicodedata.normalize("NFC", text)
    return composed
def split_graphemes(text: str) -> list[str]:
    """Return the Unicode grapheme clusters of *text*, in order.

    The input is NFC-normalized via ``normalize_text`` before splitting.
    For Devanagari the clusters are intended to serve as orthographic
    syllables.
    """
    # ``\X`` is the ``regex``-module escape that matches one grapheme cluster.
    cluster_pattern = r"\X"
    return regex.findall(cluster_pattern, normalize_text(text))
def pretokenize_graphemes(text: str) -> list[str]:
    """Pre-tokenize *text* into grapheme clusters.

    Intended to be passed as the BPE ``pretokenize_fn``. All work is
    delegated to ``split_graphemes``, which normalizes the text before
    splitting it.
    """
    clusters = split_graphemes(text)
    return clusters