File size: 2,011 Bytes

edec8b7

"""Text normalization utilities for Turkish text.

Handles:
- Turkish-aware lowercasing (İ→i, I→ı)
- Unicode NFC normalization
- Whitespace cleanup
- ALL CAPS word detection and lowercasing
"""

from __future__ import annotations

import re
import unicodedata

# Turkish-specific letters, in both lowercase and uppercase (including the
# dotless ı and the dotted İ).  ``has_turkish_chars`` treats the presence of
# any of them as an indication that a word is Turkish.
TR_CHARS: frozenset[str] = frozenset("çğışöüÇĞİŞÖÜ")

# Matches an ALL CAPS word: a run of two or more uppercase letters (ASCII A-Z
# plus the Turkish capitals Ç, Ğ, İ, Ö, Ş, Ü) delimited by word boundaries.
# The run is captured as group 1; a single capital letter does not match.
_CAPS_RE = re.compile(r"\b([A-ZÇĞİÖŞÜ]{2,})\b")


def turkish_lower(s: str) -> str:
    """Lowercase *s* using Turkish casing rules (İ→i, I→ı).

    Plain ``str.lower()`` is wrong for Turkish: it maps ``I`` to ``i``
    (instead of dotless ``ı``) and maps ``İ`` (U+0130) to ``i`` followed by
    a combining dot above (U+0307).  Both capitals are therefore rewritten
    by hand before the remaining characters go through ``str.lower()``.

    The canonically decomposed spelling of ``İ`` (``I`` followed by U+0307)
    is folded to a plain ``i`` as well, so the result does not depend on
    whether the input was NFC-normalized beforehand.

    Args:
        s: Text to lowercase.

    Returns:
        The lowercased text.
    """
    # The decomposed form must be handled before the bare "I" rule, which
    # would otherwise turn it into dotless ı plus a stray combining dot.
    return (
        s.replace("I\u0307", "i")
        .replace("İ", "i")
        .replace("I", "ı")
        .lower()
    )


def normalize_text(text: str) -> str:
    """Return *text* in Unicode NFC form with whitespace tidied up.

    Every run of whitespace is replaced by a single space, and leading and
    trailing whitespace is removed.
    """
    composed = unicodedata.normalize("NFC", text)
    single_spaced = re.sub(r"\s+", " ", composed)
    return single_spaced.strip()


def has_turkish_chars(word: str) -> bool:
    """Tell whether *word* contains at least one character from ``TR_CHARS``.

    ``TR_CHARS`` holds the Turkish-specific letters in both cases, so an
    uppercase ``Ş`` counts just like a lowercase ``ş``.  An empty *word*
    yields ``False``.
    """
    # "Not disjoint" means the word and the character set share a member.
    return not TR_CHARS.isdisjoint(word)


def detect_all_caps(text: str) -> tuple[str, frozenset[str]]:
    """Lowercase every ALL CAPS word in *text* and report which ones changed.

    Words such as ``İSTANBUL`` are a problem for suffix-based segmentation
    because the suffix table works on lowercase text.  Each word matched by
    ``_CAPS_RE`` is lowercased with ``turkish_lower``; the lowered forms are
    also collected so the output tokens can be annotated with
    ``_caps=True``.

    Returns:
        ``(modified_text, frozenset_of_lowered_caps_words)``
    """
    pieces: list[str] = []
    lowered_words: list[str] = []
    cursor = 0

    for match in _CAPS_RE.finditer(text):
        # The surrounding ``\b`` assertions are zero-width, so group 1
        # covers exactly the same span as the whole match.
        start, end = match.span(1)
        lowered = turkish_lower(match.group(1))
        lowered_words.append(lowered)
        # Copy the untouched text before the word, then the lowered word.
        pieces.append(text[cursor:start])
        pieces.append(lowered)
        cursor = end

    # Whatever follows the last match is carried over unchanged.
    pieces.append(text[cursor:])
    return "".join(pieces), frozenset(lowered_words)