File size: 6,217 Bytes
"""Tokenization engine — orchestrates the full pipeline.
This is the central pipeline that ties together all modules:
1. Text normalization (Unicode, whitespace)
2. ALL CAPS detection and lowercasing
3. Special span extraction (URLs, numbers, dates, acronyms, emojis)
4. Word-level segmentation with candidate generation/selection
5. Post-annotation (allomorph labels, compound info, acronym expansion)
6. Number/unit reclassification safety net
"""
from __future__ import annotations
from ._domain_vocab import ALL_DOMAIN_ROOTS
from .morphology import annotate_acronyms, annotate_canonical, annotate_compounds
from .normalization import detect_all_caps, normalize_text
from .resources import load_tdk_words
from .segmentation import segment_word, split_into_words
from .special_spans import find_special_spans, make_special_tokens, reclassify_numbers_in_tokens
class TokenizationEngine:
    """Pipeline driver that turns raw text into a flat list of token dicts.

    The TDK word list and the domain root vocabulary are fetched once in
    ``__init__``; ``tokenize`` only reads them afterwards.

    This class is NOT the public API. Use ``NedoTurkishTokenizer``
    instead, which delegates to this engine.
    """

    def __init__(self) -> None:
        # Looked up a single time and reused by every ``tokenize`` call.
        self._tdk: set[str] = load_tdk_words()
        self._domain_roots: frozenset[str] = ALL_DOMAIN_ROOTS

    def tokenize(self, text: str) -> list[dict[str, object]]:
        """Tokenize *text* and return the resulting token dicts.

        Empty or whitespace-only input yields an empty list. Each returned
        dict carries at minimum ``token``, ``token_type`` and ``morph_pos``.
        """
        if not (text and text.strip()):
            return []

        # ── 1-2. Normalize, then detect ALL CAPS words ──────────────────
        text, caps_set = detect_all_caps(normalize_text(text))

        # ── 3-4. Walk the special spans in the order they are reported. ─
        # Plain text found between two spans is word-segmented; the span
        # itself is converted to tokens directly.
        collected: list[dict[str, object]] = []
        cursor = 0
        for start, end, span_type, original in find_special_spans(text):
            gap = text[cursor:start] if cursor < start else ""
            if gap.strip():
                collected += self._tokenize_segment(gap, caps_set)
            collected += make_special_tokens(span_type, original)
            cursor = end

        # Whatever follows the last special span is plain text as well.
        tail = text[cursor:] if cursor < len(text) else ""
        if tail.strip():
            collected += self._tokenize_segment(tail, caps_set)

        # ── 5. Post-annotation passes, applied in this fixed order ──────
        for annotate in (
            reclassify_numbers_in_tokens,
            annotate_canonical,
            annotate_compounds,
            annotate_acronyms,
        ):
            collected = annotate(collected)

        # ── 6-7. Finalize morph_pos, then drop the leading spaces ───────
        # The leading space is an internal word-boundary marker that
        # ``_compute_morph_pos`` reads, so stripping has to come last.
        return _strip_token_text(_compute_morph_pos(collected))

    def _tokenize_segment(
        self, segment: str, caps_set: frozenset[str]
    ) -> list[dict[str, object]]:
        """Segment every word of a plain-text *segment* (no special spans)."""
        return [
            piece
            for word in split_into_words(segment)
            for piece in segment_word(
                word, self._tdk, self._domain_roots, caps_set
            )
        ]
# ── Helper: compute morph_pos across the full token stream ──────────────────
def _compute_morph_pos(tokens: list[dict[str, object]]) -> list[dict[str, object]]:
"""Recompute ``morph_pos`` consistently across the token stream.
Rules:
- Word-initial tokens (leading space, special types, PUNCT) β morph_pos = 0
- SUFFIX tokens increment the position counter
- Apostrophe suffixes continue from the previous word
"""
result: list[dict[str, object]] = []
word_pos = 0
for tok in tokens:
raw = str(tok["token"])
token_type = str(tok["token_type"])
is_word_start = raw.startswith(" ") or raw.strip().startswith("<")
# Apostrophe suffixes continue the previous word
if tok.get("_apo_suffix"):
is_word_start = False
if is_word_start or token_type in (
"NUM", "DATE", "UNIT", "URL", "MENTION", "HASHTAG", "EMOJI", "ACRONYM", "PUNCT"
):
word_pos = 0
morph_pos = 0
elif token_type == "SUFFIX":
word_pos += 1
morph_pos = word_pos
else:
# ROOT or FOREIGN within a word (shouldn't normally happen)
word_pos = 0
morph_pos = 0
result.append({**tok, "morph_pos": morph_pos})
return result
def _strip_token_text(tokens: list[dict[str, object]]) -> list[dict[str, object]]:
"""Remove internal leading whitespace from all token text strings.
During pipeline processing, a leading space in ``token`` signals
a word-initial token. Once ``morph_pos`` has been computed, this
space is no longer needed and must be stripped so the public API
returns clean text.
"""
return [{**tok, "token": str(tok["token"]).lstrip()} for tok in tokens]
|