Add smart ACRONYM detection: TDK-based disambiguation for uppercase tokens

Files changed:
- turk_tokenizer/_acronym_dict.py (+19 -5)
- turk_tokenizer/_normalizer.py (+123 -32)
- turk_tokenizer/tokenizer.py (+2 -2)
turk_tokenizer/_acronym_dict.py
CHANGED
|
@@ -83,13 +83,27 @@ ACRONYM_EXPANSIONS: dict[str, str] = {
|
|
| 83 |
|
| 84 |
|
| 85 |
def reclassify_acronyms(tokens: list[dict]) -> list[dict]:
|
| 86 |
-
"""Add ``_expansion``
|
| 87 |
result: list[dict] = []
|
| 88 |
for tok in tokens:
|
| 89 |
-
|
| 90 |
-
|
|
|
|
|
|
|
|
|
|
| 91 |
if expansion:
|
| 92 |
result.append({**tok, "_expansion": expansion, "_known_acronym": True})
|
| 93 |
-
|
| 94 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 95 |
return result
|
|
|
|
def reclassify_acronyms(tokens: list[dict]) -> list[dict]:
    """Add ``_expansion`` to known acronyms; promote CAPS ROOTs to ACRONYM.

    Tokens already typed ``ACRONYM`` by span detection gain ``_expansion``
    and ``_known_acronym`` metadata when the dictionary knows them.  ``ROOT``
    tokens flagged as all-caps (``_acronym`` or ``_caps``) are additionally
    retyped to ``ACRONYM`` when the dictionary knows them.  Every other token
    passes through unchanged.
    """
    out: list[dict] = []
    for tok in tokens:
        lookup_key = tok["token"].strip().upper()
        expansion = ACRONYM_EXPANSIONS.get(lookup_key)
        ttype = tok["type"]
        caps_root = ttype == "ROOT" and bool(tok.get("_acronym") or tok.get("_caps"))

        if expansion and ttype == "ACRONYM":
            # Span detection already typed it — just attach the expansion.
            out.append({**tok, "_expansion": expansion, "_known_acronym": True})
        elif expansion and caps_root:
            # ALL CAPS ROOT present in the acronym dict — promote it.
            out.append({
                **tok, "type": "ACRONYM",
                "_expansion": expansion, "_known_acronym": True,
            })
        else:
            # Unknown acronym, plain ROOT, or any other type: keep as-is.
            out.append(tok)
    return out
|
turk_tokenizer/_normalizer.py
CHANGED
|
@@ -78,6 +78,22 @@ NUMBER_RE = re.compile(
|
|
| 78 |
TIME_RE = re.compile(r'\d{1,2}:\d{2}(?::\d{2})?')
|
| 79 |
PLAIN_NUM_RE = re.compile(r'\b\d+\b')
|
| 80 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 81 |
TEXT_EMOJI_RE = re.compile(r'[:;=]-?[\)\(\]\[dDpPoO3]|<3')
|
| 82 |
UNICODE_EMOJI_RE = re.compile(
|
| 83 |
"[\U0001F600-\U0001F64F\U0001F300-\U0001F5FF"
|
|
@@ -89,20 +105,60 @@ UNICODE_EMOJI_RE = re.compile(
|
|
| 89 |
|
| 90 |
# Pattern priority: earlier entries win when spans overlap.
|
| 91 |
_SPAN_PATTERNS: list[tuple[re.Pattern, str]] = [
|
| 92 |
-
(URL_RE,
|
| 93 |
-
(MENTION_RE,
|
| 94 |
-
(HASHTAG_RE,
|
| 95 |
-
(DATE_RE,
|
| 96 |
-
(CURRENCY_RE,
|
| 97 |
-
(NUM_APOSTROPHE_RE,
|
| 98 |
-
(
|
| 99 |
-
(
|
| 100 |
-
(
|
| 101 |
-
(
|
| 102 |
-
(
|
|
|
|
|
|
|
| 103 |
]
|
| 104 |
|
| 105 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 106 |
# ββ Segment-based API ββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 107 |
|
| 108 |
def find_special_spans(text: str) -> list[tuple[int, int, str, str]]:
|
|
@@ -114,7 +170,22 @@ def find_special_spans(text: str) -> list[tuple[int, int, str, str]]:
|
|
| 114 |
candidates: list[tuple[int, int, str, str]] = []
|
| 115 |
for pattern, ttype in _SPAN_PATTERNS:
|
| 116 |
for m in pattern.finditer(text):
|
| 117 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 118 |
|
| 119 |
# Sort by start position, then prefer longer match
|
| 120 |
candidates.sort(key=lambda x: (x[0], -(x[1] - x[0])))
|
|
@@ -129,36 +200,56 @@ def find_special_spans(text: str) -> list[tuple[int, int, str, str]]:
|
|
| 129 |
return result
|
| 130 |
|
| 131 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 132 |
def make_special_tokens(span_type: str, original: str) -> list[dict]:
|
| 133 |
"""Create token dict(s) for a matched special span.
|
| 134 |
|
| 135 |
-
``NUM_APO`` spans are split into
|
| 136 |
"""
|
|
|
|
| 137 |
if span_type == "NUM_APO":
|
| 138 |
apo_pos = original.find("'")
|
| 139 |
if apo_pos == -1:
|
| 140 |
apo_pos = original.find("\u2019")
|
| 141 |
num_part = original[:apo_pos]
|
| 142 |
-
|
| 143 |
-
|
| 144 |
-
|
| 145 |
-
|
| 146 |
-
|
| 147 |
-
|
| 148 |
-
|
| 149 |
-
|
| 150 |
-
|
| 151 |
-
|
| 152 |
-
|
| 153 |
-
|
| 154 |
-
|
| 155 |
-
|
| 156 |
-
|
| 157 |
-
|
| 158 |
-
|
| 159 |
-
|
| 160 |
-
return
|
| 161 |
|
|
|
|
| 162 |
return [{
|
| 163 |
"token": f" {original}",
|
| 164 |
"type": span_type,
|
|
|
|
| 78 |
TIME_RE = re.compile(r'\d{1,2}:\d{2}(?::\d{2})?')
|
| 79 |
PLAIN_NUM_RE = re.compile(r'\b\d+\b')
|
| 80 |
|
| 81 |
+
# ── Acronym patterns ─────────────────────────────────────────────────────────
# Matches standalone uppercase sequences (+ optional trailing digits).
#   [A-Z]{2,}[0-9]*  → HTML, GPT, CSS3, HTML5, MP3
#   [A-Z][0-9]+      → F16, H264, A4
# Does NOT match mixed-case words (ChatGPT) because \b won't fire mid-word.
# NOTE(review): the character class was mojibake in the pasted diff;
# reconstructed as the Turkish uppercase letters ÇĞİÖŞÜ — confirm vs. repo.
ACRONYM_RE = re.compile(
    r"\b[A-ZÇĞİÖŞÜ]{2,}[0-9]*\b"
    r"|\b[A-ZÇĞİÖŞÜ][0-9]+\b"
)
|
| 90 |
+
|
| 91 |
+
# Acronym followed by apostrophe + Turkish suffix(es): NATO'nun, HTML5'ten.
# Accepts both the ASCII apostrophe and U+2019 (right single quotation mark).
# NOTE(review): the character class was mojibake in the pasted diff;
# reconstructed as the Turkish uppercase letters ÇĞİÖŞÜ — confirm vs. repo.
ACRONYM_APOSTROPHE_RE = re.compile(
    r"\b(?:[A-ZÇĞİÖŞÜ]{2,}[0-9]*|[A-ZÇĞİÖŞÜ][0-9]+)['\u2019](?:"
    + _SUFFIX_ALT + r")+\b"
)
|
| 96 |
+
|
| 97 |
TEXT_EMOJI_RE = re.compile(r'[:;=]-?[\)\(\]\[dDpPoO3]|<3')
|
| 98 |
UNICODE_EMOJI_RE = re.compile(
|
| 99 |
"[\U0001F600-\U0001F64F\U0001F300-\U0001F5FF"
|
|
|
|
| 105 |
|
| 106 |
# Pattern priority: earlier entries win when spans overlap.
_SPAN_PATTERNS: list[tuple[re.Pattern, str]] = [
    # Web entities first — they may contain digits/caps that later
    # patterns would otherwise claim.
    (URL_RE, "URL"),
    (MENTION_RE, "MENTION"),
    (HASHTAG_RE, "HASHTAG"),
    # Structured values.
    (DATE_RE, "DATE"),
    (CURRENCY_RE, "UNIT"),
    # Apostrophe forms before their bare counterparts so the suffix is
    # captured together with the base token.
    (NUM_APOSTROPHE_RE, "NUM_APO"),
    (ACRONYM_APOSTROPHE_RE, "ACRONYM_APO"),
    (ACRONYM_RE, "ACRONYM"),
    # Remaining numeric forms, then emoji.
    (NUMBER_RE, "NUM"),
    (TIME_RE, "NUM"),
    (PLAIN_NUM_RE, "NUM"),
    (UNICODE_EMOJI_RE, "EMOJI"),
    (TEXT_EMOJI_RE, "EMOJI"),
]
|
| 122 |
|
| 123 |
|
| 124 |
+
# ── Acronym vs Turkish word disambiguation ───────────────────────────────────

def _is_known_turkish_word(word_upper: str) -> bool:
    """Return True if *word_upper* (ALL CAPS) is a known Turkish word.

    Decision order:
      1. listed in ACRONYM_EXPANSIONS               → acronym (False)
      2. listed after stripping trailing digits     → acronym (False)
         (HTML5 → HTML)
      3. lowercase form found in the TDK wordlist   → Turkish word (True)
      4. lowercase form is a known proper noun      → Turkish word (True)
      5. anything else                              → acronym (False)
    """
    # Imported lazily; presumably to avoid circular imports at module load.
    from ._acronym_dict import ACRONYM_EXPANSIONS  # noqa: PLC0415
    from ._preprocessor import _turkish_lower, _load_proper_nouns  # noqa: PLC0415
    from ._tdk_vocab import load_tdk_words  # noqa: PLC0415

    # A dictionary acronym always wins — exact form or digit-stripped stem.
    if word_upper in ACRONYM_EXPANSIONS:
        return False
    stem = word_upper.rstrip("0123456789")
    if stem and stem != word_upper and stem in ACRONYM_EXPANSIONS:
        return False

    lowered = _turkish_lower(word_upper)

    # Real Turkish word according to the TDK dictionary?
    tdk_words = load_tdk_words()
    if tdk_words and lowered in tdk_words:
        return True

    # Known proper noun (İstanbul, Ankara, …)?
    return lowered in _load_proper_nouns()
|
| 160 |
+
|
| 161 |
+
|
| 162 |
# ββ Segment-based API ββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 163 |
|
| 164 |
def find_special_spans(text: str) -> list[tuple[int, int, str, str]]:
|
|
|
|
| 170 |
candidates: list[tuple[int, int, str, str]] = []
|
| 171 |
for pattern, ttype in _SPAN_PATTERNS:
|
| 172 |
for m in pattern.finditer(text):
|
| 173 |
+
original = m.group(0)
|
| 174 |
+
|
| 175 |
+
# Acronym filtering: skip if it's actually a Turkish word
|
| 176 |
+
if ttype in ("ACRONYM", "ACRONYM_APO"):
|
| 177 |
+
# Extract the uppercase base (before apostrophe for APO)
|
| 178 |
+
if ttype == "ACRONYM_APO":
|
| 179 |
+
apo = original.find("'")
|
| 180 |
+
if apo == -1:
|
| 181 |
+
apo = original.find("\u2019")
|
| 182 |
+
acr_base = original[:apo]
|
| 183 |
+
else:
|
| 184 |
+
acr_base = original
|
| 185 |
+
if _is_known_turkish_word(acr_base):
|
| 186 |
+
continue
|
| 187 |
+
|
| 188 |
+
candidates.append((m.start(), m.end(), ttype, original))
|
| 189 |
|
| 190 |
# Sort by start position, then prefer longer match
|
| 191 |
candidates.sort(key=lambda x: (x[0], -(x[1] - x[0])))
|
|
|
|
| 200 |
return result
|
| 201 |
|
| 202 |
|
| 203 |
+
def _split_apostrophe_suffixes(suffix_str: str) -> list[dict]:
|
| 204 |
+
"""Split a suffix string (after apostrophe) into individual SUFFIX tokens."""
|
| 205 |
+
tokens: list[dict] = []
|
| 206 |
+
remaining = suffix_str.lower()
|
| 207 |
+
while remaining:
|
| 208 |
+
matched = False
|
| 209 |
+
for s in _NUM_SUFFIXES:
|
| 210 |
+
if remaining.startswith(s):
|
| 211 |
+
tokens.append({"token": s, "type": "SUFFIX", "_apo_suffix": True})
|
| 212 |
+
remaining = remaining[len(s):]
|
| 213 |
+
matched = True
|
| 214 |
+
break
|
| 215 |
+
if not matched:
|
| 216 |
+
tokens.append({"token": remaining, "type": "SUFFIX", "_apo_suffix": True})
|
| 217 |
+
break
|
| 218 |
+
return tokens
|
| 219 |
+
|
| 220 |
+
|
| 221 |
def make_special_tokens(span_type: str, original: str) -> list[dict]:
|
| 222 |
"""Create token dict(s) for a matched special span.
|
| 223 |
|
| 224 |
+
``NUM_APO`` and ``ACRONYM_APO`` spans are split into base + SUFFIX tokens.
|
| 225 |
"""
|
| 226 |
+
# ββ Number + apostrophe + suffix (3'te, 1990'larda) ββββββββββββββββββ
|
| 227 |
if span_type == "NUM_APO":
|
| 228 |
apo_pos = original.find("'")
|
| 229 |
if apo_pos == -1:
|
| 230 |
apo_pos = original.find("\u2019")
|
| 231 |
num_part = original[:apo_pos]
|
| 232 |
+
return [
|
| 233 |
+
{"token": f" {num_part}", "type": "NUM", "_num": True},
|
| 234 |
+
*_split_apostrophe_suffixes(original[apo_pos + 1:]),
|
| 235 |
+
]
|
| 236 |
+
|
| 237 |
+
# ββ Acronym + apostrophe + suffix (NATO'nun, HTML5'ten) ββββββββββββββ
|
| 238 |
+
if span_type == "ACRONYM_APO":
|
| 239 |
+
apo_pos = original.find("'")
|
| 240 |
+
if apo_pos == -1:
|
| 241 |
+
apo_pos = original.find("\u2019")
|
| 242 |
+
acr_part = original[:apo_pos]
|
| 243 |
+
return [
|
| 244 |
+
{"token": f" {acr_part}", "type": "ACRONYM", "_acronym": True},
|
| 245 |
+
*_split_apostrophe_suffixes(original[apo_pos + 1:]),
|
| 246 |
+
]
|
| 247 |
+
|
| 248 |
+
# ββ Plain acronym (HTML5, GPT) ββββββββββββββββββββββββββββββββββββββ
|
| 249 |
+
if span_type == "ACRONYM":
|
| 250 |
+
return [{"token": f" {original}", "type": "ACRONYM", "_acronym": True}]
|
| 251 |
|
| 252 |
+
# ββ Everything else (NUM, DATE, URL, MENTION, HASHTAG, EMOJI, UNIT) ββ
|
| 253 |
return [{
|
| 254 |
"token": f" {original}",
|
| 255 |
"type": span_type,
|
turk_tokenizer/tokenizer.py
CHANGED
|
@@ -56,13 +56,13 @@ _DOMAIN_ROOTS_LOWER = {k.lower() for k in ALL_DOMAIN_ROOTS}
|
|
| 56 |
# ββ Token types βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 57 |
|
| 58 |
_SPECIAL_TYPES = frozenset(
|
| 59 |
-
("NUM", "DATE", "UNIT", "URL", "MENTION", "HASHTAG", "EMOJI")
|
| 60 |
)
|
| 61 |
|
| 62 |
_TYPE_SYM = {
|
| 63 |
"ROOT": "R", "SUFFIX": "S", "FOREIGN": "F", "BPE": "B", "PUNCT": "P",
|
| 64 |
"NUM": "N", "DATE": "D", "UNIT": "U",
|
| 65 |
-
"URL": "L", "MENTION": "@", "HASHTAG": "#", "EMOJI": "E",
|
| 66 |
}
|
| 67 |
|
| 68 |
|
|
|
|
| 56 |
# ββ Token types βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 57 |
|
| 58 |
_SPECIAL_TYPES = frozenset(
|
| 59 |
+
("NUM", "DATE", "UNIT", "URL", "MENTION", "HASHTAG", "EMOJI", "ACRONYM")
|
| 60 |
)
|
| 61 |
|
| 62 |
_TYPE_SYM = {
|
| 63 |
"ROOT": "R", "SUFFIX": "S", "FOREIGN": "F", "BPE": "B", "PUNCT": "P",
|
| 64 |
"NUM": "N", "DATE": "D", "UNIT": "U",
|
| 65 |
+
"URL": "L", "MENTION": "@", "HASHTAG": "#", "EMOJI": "E", "ACRONYM": "A",
|
| 66 |
}
|
| 67 |
|
| 68 |
|