"""Text preprocessing for Hindi BPE: NFC normalization and grapheme clustering."""

import unicodedata

import regex


def normalize_text(text: str) -> str:
    """Return *text* converted to Unicode Normalization Form C (NFC).

    NFC applies canonical decomposition followed by canonical composition,
    so canonically equivalent spellings of the same text end up as the same
    code point sequence.

    Args:
        text: The string to normalize.

    Returns:
        The NFC-normalized string.
    """
    composed = unicodedata.normalize("NFC", text)
    return composed


def split_graphemes(text: str) -> list[str]:
    """Break *text* into a list of Unicode grapheme clusters.

    The input is first passed through ``normalize_text`` (NFC), then every
    match of the ``regex`` module's ``\\X`` pattern is collected in order.

    Args:
        text: The raw string to segment.

    Returns:
        The grapheme clusters of the normalized text, in order of appearance.
        An empty input yields an empty list.
    """
    # NOTE(review): for Devanagari these clusters are intended to act as
    # orthographic syllables; whether virama-joined conjuncts stay in a single
    # cluster presumably depends on the Unicode segmentation rules implemented
    # by the installed ``regex`` version -- confirm against the target version.
    composed = normalize_text(text)
    # The pattern has no capture groups, so each match's full text is one cluster.
    return [cluster.group() for cluster in regex.finditer(r"\X", composed)]


def pretokenize_graphemes(text: str) -> list[str]:
    """Pretokenize *text* into grapheme clusters for BPE.

    This is a thin alias for ``split_graphemes`` (which normalizes before
    splitting), exposed under a name suited for use as a BPE
    ``pretokenize_fn``.

    Args:
        text: The raw string to pretokenize.

    Returns:
        Exactly what ``split_graphemes(text)`` returns.
    """
    clusters = split_graphemes(text)
    return clusters