hindi-bpe-tokenizer / preprocess.py
prity-k's picture
Upload 6 files
7a55d6f verified
"""Text preprocessing for Hindi BPE: NFC normalization and grapheme clustering."""
import unicodedata
import regex
def normalize_text(text: str) -> str:
    """Return ``text`` converted to Unicode Normalization Form C (NFC).

    NFC canonically decomposes the input and then recomposes it, so that
    canonically equivalent spellings of the same text end up as the same
    code point sequence.

    Args:
        text: The string to normalize.

    Returns:
        The NFC-normalized string.
    """
    composed = unicodedata.normalize("NFC", text)
    return composed
def split_graphemes(text: str) -> list[str]:
    """Break ``text`` into a list of Unicode grapheme clusters.

    The input is first passed through ``normalize_text`` and the result is
    then scanned with the ``regex`` module's ``\\X`` pattern, which matches
    one grapheme cluster at a time.

    Args:
        text: The string to segment.

    Returns:
        The grapheme clusters of the normalized text, in order of appearance.
    """
    # NOTE(review): whether virama-joined Devanagari conjuncts stay in a
    # single cluster presumably depends on the Unicode segmentation rules
    # implemented by the installed ``regex`` version -- confirm before
    # relying on clusters being full orthographic syllables.
    return regex.findall(r"\X", normalize_text(text))
def pretokenize_graphemes(text: str) -> list[str]:
    """Pre-tokenize ``text`` into grapheme clusters for BPE.

    This is a thin alias for ``split_graphemes`` (which also normalizes the
    input), exposed under a name intended to be passed as the BPE
    ``pretokenize_fn`` callback.

    Args:
        text: The raw string to pre-tokenize.

    Returns:
        The list of grapheme clusters produced by ``split_graphemes``.
    """
    clusters = split_graphemes(text)
    return clusters