Ethosoft
Refactor to standalone v2.0: zero dependencies, internal engine, removed zemberek/hf wrapper
"""Text normalization utilities for Turkish text.

Handles:
- Turkish-aware lowercasing (İ→i, I→ı)
- Unicode NFC normalization
- Whitespace cleanup
- ALL CAPS word detection and lowercasing
"""
| from __future__ import annotations | |
| import re | |
| import unicodedata | |
# Matches a run of two or more uppercase letters (ASCII A-Z plus the Turkish
# capitals Ç, Ğ, İ, Ö, Ş, Ü) delimited by word boundaries; group 1 is the word.
_CAPS_RE = re.compile(r"\b([A-ZÇĞİÖŞÜ]{2,})\b")

# Letters specific to the Turkish alphabet, in both lower and upper case.
# Finding any of them in a word is treated as a sign that the word is Turkish.
TR_CHARS: frozenset[str] = frozenset("çğışöüÇĞİŞÖÜ")
def turkish_lower(s: str) -> str:
    """Lowercase *s* using Turkish casing rules (İ→i, I→ı).

    Plain ``str.lower()`` is locale-independent: it turns ``I`` into ``i``
    and ``İ`` into ``i`` followed by U+0307 (combining dot above). Both
    results are wrong for Turkish, so the two capitals are rewritten
    explicitly before the generic lowercasing runs.

    A dotted capital I may also arrive in decomposed form (``I`` followed
    by U+0307). That sequence is lowered to a plain ``i`` as well, so the
    result does not depend on whether the input was NFC-normalized.

    Args:
        s: Text to lowercase.

    Returns:
        The lowercased text.
    """
    return (
        # Decomposed İ must be handled first; otherwise its base letter would
        # be caught by the I→ı rule and leave a stray combining dot behind.
        s.replace("I\u0307", "i")
        .replace("İ", "i")
        .replace("I", "ı")
        .lower()
    )
def normalize_text(text: str) -> str:
    """Return *text* in Unicode NFC form with whitespace tidied up.

    Every run of whitespace is replaced by a single space, and leading and
    trailing whitespace is removed.
    """
    composed = unicodedata.normalize("NFC", text)
    return re.sub(r"\s+", " ", composed).strip()
def has_turkish_chars(word: str) -> bool:
    """Tell whether *word* contains at least one character from ``TR_CHARS``.

    ``TR_CHARS`` holds the Turkish-specific letters ç, ğ, ı, ş, ö, ü and
    their uppercase counterparts.
    """
    # The word has a Turkish letter exactly when it shares a character with
    # the set, i.e. when the two are not disjoint.
    return not TR_CHARS.isdisjoint(word)
def detect_all_caps(text: str) -> tuple[str, frozenset[str]]:
    """Lowercase every ALL CAPS word in *text* and report which ones changed.

    A word counts as ALL CAPS when ``_CAPS_RE`` matches it. Each match is
    lowercased with ``turkish_lower`` so that words such as ``İSTANBUL``
    reach later suffix-based segmentation in lowercase. The lowered forms
    are also collected so callers can mark the corresponding output tokens
    as having been capitalised (``_caps=True``).

    Returns:
        ``(modified_text, frozenset_of_lowered_caps_words)``
    """
    pieces: list[str] = []
    lowered_words: set[str] = set()
    cursor = 0

    for match in _CAPS_RE.finditer(text):
        lowered = turkish_lower(match.group(1))
        lowered_words.add(lowered)
        # Copy the untouched text before this match, then the lowered word.
        pieces.append(text[cursor:match.start()])
        pieces.append(lowered)
        cursor = match.end()

    # Whatever follows the last match is carried over unchanged.
    pieces.append(text[cursor:])
    return "".join(pieces), frozenset(lowered_words)