nmstech committed on
Commit
fcd513a
Β·
verified Β·
1 Parent(s): 58e6961

Add smart ACRONYM detection: TDK-based disambiguation for uppercase tokens

Browse files
turk_tokenizer/_acronym_dict.py CHANGED
@@ -83,13 +83,27 @@ ACRONYM_EXPANSIONS: dict[str, str] = {
83
 
84
 
85
  def reclassify_acronyms(tokens: list[dict]) -> list[dict]:
86
- """Add ``_expansion`` field to known acronyms in the token stream."""
87
  result: list[dict] = []
88
  for tok in tokens:
89
- if tok["type"] == "ROOT" and (tok.get("_acronym") or tok.get("_caps")):
90
- expansion = ACRONYM_EXPANSIONS.get(tok["token"].strip().upper())
 
 
 
91
  if expansion:
92
  result.append({**tok, "_expansion": expansion, "_known_acronym": True})
93
- continue
94
- result.append(tok)
 
 
 
 
 
 
 
 
 
 
 
95
  return result
 
83
 
84
 
85
def reclassify_acronyms(tokens: list[dict]) -> list[dict]:
    """Add ``_expansion`` to known acronyms; promote CAPS ROOTs to ACRONYM."""
    out: list[dict] = []
    for tok in tokens:
        expansion = ACRONYM_EXPANSIONS.get(tok["token"].strip().upper())
        ttype = tok["type"]
        caps_root = ttype == "ROOT" and (tok.get("_acronym") or tok.get("_caps"))

        if expansion and ttype == "ACRONYM":
            # Already typed as ACRONYM by span detection — just attach the expansion.
            out.append({**tok, "_expansion": expansion, "_known_acronym": True})
        elif expansion and caps_root:
            # ALL-CAPS ROOT found in the acronym dict → promote to ACRONYM.
            out.append({
                **tok,
                "type": "ACRONYM",
                "_expansion": expansion,
                "_known_acronym": True,
            })
        else:
            # Unknown token, or no dictionary expansion: pass through unchanged.
            out.append(tok)
    return out
turk_tokenizer/_normalizer.py CHANGED
@@ -78,6 +78,22 @@ NUMBER_RE = re.compile(
78
  TIME_RE = re.compile(r'\d{1,2}:\d{2}(?::\d{2})?')
79
  PLAIN_NUM_RE = re.compile(r'\b\d+\b')
80
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
81
  TEXT_EMOJI_RE = re.compile(r'[:;=]-?[\)\(\]\[dDpPoO3]|<3')
82
  UNICODE_EMOJI_RE = re.compile(
83
  "[\U0001F600-\U0001F64F\U0001F300-\U0001F5FF"
@@ -89,20 +105,60 @@ UNICODE_EMOJI_RE = re.compile(
89
 
90
  # Pattern priority: earlier entries win when spans overlap.
91
  _SPAN_PATTERNS: list[tuple[re.Pattern, str]] = [
92
- (URL_RE, "URL"),
93
- (MENTION_RE, "MENTION"),
94
- (HASHTAG_RE, "HASHTAG"),
95
- (DATE_RE, "DATE"),
96
- (CURRENCY_RE, "UNIT"),
97
- (NUM_APOSTROPHE_RE, "NUM_APO"),
98
- (NUMBER_RE, "NUM"),
99
- (TIME_RE, "NUM"),
100
- (PLAIN_NUM_RE, "NUM"),
101
- (UNICODE_EMOJI_RE, "EMOJI"),
102
- (TEXT_EMOJI_RE, "EMOJI"),
 
 
103
  ]
104
 
105
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
106
  # ── Segment-based API ────────────────────────────────────────────────────────
107
 
108
  def find_special_spans(text: str) -> list[tuple[int, int, str, str]]:
@@ -114,7 +170,22 @@ def find_special_spans(text: str) -> list[tuple[int, int, str, str]]:
114
  candidates: list[tuple[int, int, str, str]] = []
115
  for pattern, ttype in _SPAN_PATTERNS:
116
  for m in pattern.finditer(text):
117
- candidates.append((m.start(), m.end(), ttype, m.group(0)))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
118
 
119
  # Sort by start position, then prefer longer match
120
  candidates.sort(key=lambda x: (x[0], -(x[1] - x[0])))
@@ -129,36 +200,56 @@ def find_special_spans(text: str) -> list[tuple[int, int, str, str]]:
129
  return result
130
 
131
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
132
  def make_special_tokens(span_type: str, original: str) -> list[dict]:
133
  """Create token dict(s) for a matched special span.
134
 
135
- ``NUM_APO`` spans are split into a NUM token + SUFFIX token(s).
136
  """
 
137
  if span_type == "NUM_APO":
138
  apo_pos = original.find("'")
139
  if apo_pos == -1:
140
  apo_pos = original.find("\u2019")
141
  num_part = original[:apo_pos]
142
- suffix_str = original[apo_pos + 1:]
143
-
144
- tokens: list[dict] = [{"token": f" {num_part}", "type": "NUM", "_num": True}]
145
-
146
- # Split suffix_str into individual Turkish suffixes
147
- remaining = suffix_str.lower()
148
- while remaining:
149
- matched = False
150
- for s in _NUM_SUFFIXES:
151
- if remaining.startswith(s):
152
- tokens.append({"token": s, "type": "SUFFIX", "_apo_suffix": True})
153
- remaining = remaining[len(s):]
154
- matched = True
155
- break
156
- if not matched:
157
- # Safety fallback β€” shouldn't happen if the regex matched
158
- tokens.append({"token": remaining, "type": "SUFFIX", "_apo_suffix": True})
159
- break
160
- return tokens
161
 
 
162
  return [{
163
  "token": f" {original}",
164
  "type": span_type,
 
78
  TIME_RE = re.compile(r'\d{1,2}:\d{2}(?::\d{2})?')
79
  PLAIN_NUM_RE = re.compile(r'\b\d+\b')
80
 
81
# ── Acronym patterns ─────────────────────────────────────────────────────────
# Matches standalone uppercase sequences (+ optional trailing digits).
#   [A-Z]{2,}[0-9]*  →  HTML, GPT, CSS3, HTML5, MP3
#   [A-Z][0-9]+      →  F16, H264, A4
# The character class also covers the Turkish uppercase letters ÇĞİÖŞÜ.
# Does NOT match mixed-case words (ChatGPT) because \b won't fire mid-word.
ACRONYM_RE = re.compile(
    r"\b[A-ZÇĞİÖŞÜ]{2,}[0-9]*\b"
    r"|\b[A-ZÇĞİÖŞÜ][0-9]+\b"
)

# Acronym followed by apostrophe + Turkish suffix(es): NATO'nun, HTML5'ten.
# Accepts both ASCII ' and the right single quote \u2019 before the suffix run.
# NOTE(review): relies on the module-level _SUFFIX_ALT alternation — confirm it
# is anchored/ordered so longer suffixes win inside the (?:...)+ group.
ACRONYM_APOSTROPHE_RE = re.compile(
    r"\b(?:[A-ZÇĞİÖŞÜ]{2,}[0-9]*|[A-ZÇĞİÖŞÜ][0-9]+)['\u2019](?:"
    + _SUFFIX_ALT + r")+\b"
)
+ )
96
+
97
  TEXT_EMOJI_RE = re.compile(r'[:;=]-?[\)\(\]\[dDpPoO3]|<3')
98
  UNICODE_EMOJI_RE = re.compile(
99
  "[\U0001F600-\U0001F64F\U0001F300-\U0001F5FF"
 
105
 
106
# Pattern priority: earlier entries win when spans overlap.
_SPAN_PATTERNS: list[tuple[re.Pattern, str]] = [
    (URL_RE, "URL"),
    (MENTION_RE, "MENTION"),
    (HASHTAG_RE, "HASHTAG"),
    (DATE_RE, "DATE"),
    (CURRENCY_RE, "UNIT"),
    (NUM_APOSTROPHE_RE, "NUM_APO"),
    # Acronym patterns are listed before the numeric ones; ordering only
    # matters when matched spans overlap (see comment above).
    (ACRONYM_APOSTROPHE_RE, "ACRONYM_APO"),
    (ACRONYM_RE, "ACRONYM"),
    (NUMBER_RE, "NUM"),
    (TIME_RE, "NUM"),
    (PLAIN_NUM_RE, "NUM"),
    (UNICODE_EMOJI_RE, "EMOJI"),
    (TEXT_EMOJI_RE, "EMOJI"),
]
122
 
123
 
124
+ # ── Acronym vs Turkish word disambiguation ───────────────────────────────────
125
+
126
def _is_known_turkish_word(word_upper: str) -> bool:
    """Return True if *word_upper* (ALL CAPS) is a known Turkish word.

    Checks (in order):
    1. ACRONYM_EXPANSIONS dict → always acronym (return False)
    2. Same dict without trailing digits (HTML5 → HTML)
    3. TDK dictionary → Turkish word (return True)
    4. Proper nouns list → Turkish word (return True)
    5. Otherwise → treat as acronym (return False)
    """
    # Imports are local to avoid circular-import problems at module load.
    from ._acronym_dict import ACRONYM_EXPANSIONS  # noqa: PLC0415
    from ._preprocessor import _turkish_lower, _load_proper_nouns  # noqa: PLC0415
    from ._tdk_vocab import load_tdk_words  # noqa: PLC0415

    # Known acronyms always win, digits included or stripped (HTML5 → HTML).
    if word_upper in ACRONYM_EXPANSIONS:
        return False
    stem = word_upper.rstrip("0123456789")
    if stem and stem != word_upper and stem in ACRONYM_EXPANSIONS:
        return False

    lowered = _turkish_lower(word_upper)

    # TDK dictionary: the lowercase form being a real Turkish word wins next.
    vocab = load_tdk_words()
    if vocab and lowered in vocab:
        return True

    # Finally the proper-noun list (İstanbul, Ankara…); otherwise: acronym.
    return lowered in _load_proper_nouns()
160
+
161
+
162
  # ── Segment-based API ────────────────────────────────────────────────────────
163
 
164
  def find_special_spans(text: str) -> list[tuple[int, int, str, str]]:
 
170
  candidates: list[tuple[int, int, str, str]] = []
171
  for pattern, ttype in _SPAN_PATTERNS:
172
  for m in pattern.finditer(text):
173
+ original = m.group(0)
174
+
175
+ # Acronym filtering: skip if it's actually a Turkish word
176
+ if ttype in ("ACRONYM", "ACRONYM_APO"):
177
+ # Extract the uppercase base (before apostrophe for APO)
178
+ if ttype == "ACRONYM_APO":
179
+ apo = original.find("'")
180
+ if apo == -1:
181
+ apo = original.find("\u2019")
182
+ acr_base = original[:apo]
183
+ else:
184
+ acr_base = original
185
+ if _is_known_turkish_word(acr_base):
186
+ continue
187
+
188
+ candidates.append((m.start(), m.end(), ttype, original))
189
 
190
  # Sort by start position, then prefer longer match
191
  candidates.sort(key=lambda x: (x[0], -(x[1] - x[0])))
 
200
  return result
201
 
202
 
203
def _split_apostrophe_suffixes(suffix_str: str) -> list[dict]:
    """Split a suffix string (after apostrophe) into individual SUFFIX tokens."""
    pieces: list[dict] = []
    rest = suffix_str.lower()
    while rest:
        # First suffix in _NUM_SUFFIXES order that prefixes the remainder.
        hit = next((s for s in _NUM_SUFFIXES if rest.startswith(s)), None)
        if hit is None:
            # Safety fallback — shouldn't happen if the span regex matched.
            pieces.append({"token": rest, "type": "SUFFIX", "_apo_suffix": True})
            break
        pieces.append({"token": hit, "type": "SUFFIX", "_apo_suffix": True})
        rest = rest[len(hit):]
    return pieces
219
+
220
+
221
  def make_special_tokens(span_type: str, original: str) -> list[dict]:
222
  """Create token dict(s) for a matched special span.
223
 
224
+ ``NUM_APO`` and ``ACRONYM_APO`` spans are split into base + SUFFIX tokens.
225
  """
226
+ # ── Number + apostrophe + suffix (3'te, 1990'larda) ──────────────────
227
  if span_type == "NUM_APO":
228
  apo_pos = original.find("'")
229
  if apo_pos == -1:
230
  apo_pos = original.find("\u2019")
231
  num_part = original[:apo_pos]
232
+ return [
233
+ {"token": f" {num_part}", "type": "NUM", "_num": True},
234
+ *_split_apostrophe_suffixes(original[apo_pos + 1:]),
235
+ ]
236
+
237
+ # ── Acronym + apostrophe + suffix (NATO'nun, HTML5'ten) ──────────────
238
+ if span_type == "ACRONYM_APO":
239
+ apo_pos = original.find("'")
240
+ if apo_pos == -1:
241
+ apo_pos = original.find("\u2019")
242
+ acr_part = original[:apo_pos]
243
+ return [
244
+ {"token": f" {acr_part}", "type": "ACRONYM", "_acronym": True},
245
+ *_split_apostrophe_suffixes(original[apo_pos + 1:]),
246
+ ]
247
+
248
+ # ── Plain acronym (HTML5, GPT) ──────────────────────────────────────
249
+ if span_type == "ACRONYM":
250
+ return [{"token": f" {original}", "type": "ACRONYM", "_acronym": True}]
251
 
252
+ # ── Everything else (NUM, DATE, URL, MENTION, HASHTAG, EMOJI, UNIT) ──
253
  return [{
254
  "token": f" {original}",
255
  "type": span_type,
turk_tokenizer/tokenizer.py CHANGED
@@ -56,13 +56,13 @@ _DOMAIN_ROOTS_LOWER = {k.lower() for k in ALL_DOMAIN_ROOTS}
56
  # ── Token types ───────────────────────────────────────────────────────────────
57
 
58
  _SPECIAL_TYPES = frozenset(
59
- ("NUM", "DATE", "UNIT", "URL", "MENTION", "HASHTAG", "EMOJI")
60
  )
61
 
62
  _TYPE_SYM = {
63
  "ROOT": "R", "SUFFIX": "S", "FOREIGN": "F", "BPE": "B", "PUNCT": "P",
64
  "NUM": "N", "DATE": "D", "UNIT": "U",
65
- "URL": "L", "MENTION": "@", "HASHTAG": "#", "EMOJI": "E",
66
  }
67
 
68
 
 
56
# ── Token types ───────────────────────────────────────────────────────────────

# Token types produced by special-span detection in the normalizer
# (NOTE(review): presumably these skip morphological splitting — confirm
# against the tokenize path that consults this set).
_SPECIAL_TYPES = frozenset(
    ("NUM", "DATE", "UNIT", "URL", "MENTION", "HASHTAG", "EMOJI", "ACRONYM")
)

# Single-character symbol per token type; ACRONYM renders as "A".
_TYPE_SYM = {
    "ROOT": "R", "SUFFIX": "S", "FOREIGN": "F", "BPE": "B", "PUNCT": "P",
    "NUM": "N", "DATE": "D", "UNIT": "U",
    "URL": "L", "MENTION": "@", "HASHTAG": "#", "EMOJI": "E", "ACRONYM": "A",
}
67
 
68