File size: 2,011 Bytes
edec8b7 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 | """Text normalization utilities for Turkish text.
Handles:
- Turkish-aware lowercasing (İ→i, I→ı)
- Unicode NFC normalization
- Whitespace cleanup
- ALL CAPS word detection and lowercasing
"""
from __future__ import annotations
import re
import unicodedata
# Turkish letters that lie outside ASCII, in lowercase and uppercase form
# (plain "i" and "I" are not part of the set). Presence of any of them
# indicates a Turkish word.
TR_CHARS: frozenset[str] = frozenset("çğışöüÇĞİŞÖÜ")
# Matches a whole word (delimited by ``\b`` on both sides) made of two or
# more uppercase letters drawn from A-Z plus ÇĞİÖŞÜ; group 1 is the word.
_CAPS_RE = re.compile(r"\b([A-ZÇĞİÖŞÜ]{2,})\b")
def turkish_lower(s: str) -> str:
    """Lowercase *s* using Turkish casing rules (İ→i, I→ı).

    Python's ``str.lower()`` is locale-independent: it maps ``I`` to ``i``
    and ``İ`` to ``i`` followed by a combining dot above, neither of which
    is correct for Turkish. The two capital I letters are therefore
    rewritten explicitly before the generic lowercasing runs.

    Args:
        s: Text to lowercase. It may be in composed (NFC) or decomposed
            (NFD) Unicode form.

    Returns:
        The lowercased text, with dotted capital İ mapped to ``i`` and
        dotless capital I mapped to ``ı``.
    """
    # A decomposed dotted capital I is "I" + U+0307 (combining dot above).
    # Fold it to "i" first; otherwise the bare "I" would be turned into a
    # dotless "ı" that still carries the combining dot.
    s = s.replace("I\u0307", "i")
    # Order matters: İ must be handled before the generic ``lower()`` call,
    # and I must become ı before ``lower()`` would turn it into i.
    return s.replace("İ", "i").replace("I", "ı").lower()
def normalize_text(text: str) -> str:
    """Return *text* in Unicode NFC form with whitespace tidied up.

    Every run of whitespace is replaced by a single space, and leading and
    trailing whitespace is removed.
    """
    composed = unicodedata.normalize("NFC", text)
    single_spaced = re.sub(r"\s+", " ", composed)
    return single_spaced.strip()
def has_turkish_chars(word: str) -> bool:
    """Tell whether *word* contains at least one character from ``TR_CHARS``.

    ``TR_CHARS`` holds the Turkish-specific letters ç, ğ, ı, ş, ö, ü and
    their uppercase forms.
    """
    # The sets are "not disjoint" exactly when some character of the word
    # is a member of TR_CHARS.
    return not TR_CHARS.isdisjoint(word)
def detect_all_caps(text: str) -> tuple[str, frozenset[str]]:
    """Lowercase every ALL CAPS word in *text* and report which ones changed.

    Words matched by ``_CAPS_RE`` (for example ``İSTANBUL``) are replaced
    with their ``turkish_lower`` form, because suffix-based segmentation
    works against a lowercase suffix table. The lowered forms are also
    collected so the resulting tokens can later be annotated with
    ``_caps=True``.

    Returns:
        ``(modified_text, frozenset_of_lowered_caps_words)``
    """
    lowered_words: set[str] = set()
    pieces: list[str] = []
    cursor = 0
    for match in _CAPS_RE.finditer(text):
        lowered = turkish_lower(match.group(1))
        lowered_words.add(lowered)
        # Keep the untouched text before the match, then the lowered word.
        pieces.append(text[cursor:match.start()])
        pieces.append(lowered)
        cursor = match.end()
    # Whatever follows the last match is copied through unchanged.
    pieces.append(text[cursor:])
    return "".join(pieces), frozenset(lowered_words)
|