nmstech committed on
Commit
58e6961
·
verified ·
1 Parent(s): 986e073

Fix broken placeholder mechanism: replace with segment-based tokenization

Browse files
.claude/settings.local.json ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "permissions": {
3
+ "allow": [
4
+ "Bash(git add:*)",
5
+ "Bash(git push:*)",
6
+ "Bash(git remote:*)",
7
+ "Bash(git fetch:*)",
8
+ "Bash(hf whoami:*)",
9
+ "Bash(huggingface-cli whoami:*)",
10
+ "Bash(python3:*)",
11
+ "Bash(git lfs:*)",
12
+ "Bash(sudo apt-get:*)"
13
+ ]
14
+ }
15
+ }
.gitignore ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ __pycache__/
2
+ *.pyc
3
+ *.pyo
4
+ *.egg-info/
5
+ dist/
6
+ build/
7
+ .eggs/
8
+ *.egg
9
+ .env
10
+ .venv/
turk_tokenizer/_normalizer.py CHANGED
@@ -1,4 +1,8 @@
1
- """Fix 8: Special token normalization (NUM, DATE, URL, MENTION, HASHTAG, EMOJI)."""
 
 
 
 
2
 
3
  from __future__ import annotations
4
 
@@ -26,21 +30,54 @@ ROMAN_NUMERALS = {
26
  "xi","xii","xiii","xiv","xv","xvi","xvii","xviii","xix","xx",
27
  }
28
 
 
 
29
  URL_RE = re.compile(r'https?://\S+|www\.\S+', re.IGNORECASE)
30
  MENTION_RE = re.compile(r'@[\w\u00C0-\u024F]+')
31
  HASHTAG_RE = re.compile(r'#[\w\u00C0-\u024F]+')
32
- NUMBER_RE = re.compile(
33
- r'%\d+[\.,]?\d*'
34
- r'|\d+[\.,]\d+'
35
- r'|\d{1,3}(?:\.\d{3})+'
36
- r'|\d+%'
37
- r'|\d+/\d+'
 
 
 
 
 
 
 
 
 
 
 
 
38
  )
 
 
 
 
 
 
 
 
 
39
  DATE_RE = re.compile(
40
  r'\d{1,2}[./\-]\d{1,2}[./\-]\d{2,4}'
41
  r'|\d{4}[./\-]\d{1,2}[./\-]\d{1,2}'
42
  )
43
  CURRENCY_RE = re.compile(r'[$€£¥₺₽]\d+[\.,]?\d*|\d+[\.,]?\d*[$€£¥₺₽]')
 
 
 
 
 
 
 
 
 
 
44
  TEXT_EMOJI_RE = re.compile(r'[:;=]-?[\)\(\]\[dDpPoO3]|<3')
45
  UNICODE_EMOJI_RE = re.compile(
46
  "[\U0001F600-\U0001F64F\U0001F300-\U0001F5FF"
@@ -50,62 +87,89 @@ UNICODE_EMOJI_RE = re.compile(
50
  flags=re.UNICODE,
51
  )
52
 
53
-
54
- def preprocess_special_tokens(text: str) -> tuple[str, list[dict]]:
55
- """Replace special tokens with placeholders before base tokenization."""
56
- placeholders: list[dict] = []
57
- counter = [0]
58
-
59
- def _ph(token_type: str, original: str) -> str:
60
- ph = f"\x00{token_type}{counter[0]}\x00"
61
- placeholders.append({"placeholder": ph, "type": token_type, "original": original})
62
- counter[0] += 1
63
- return ph
64
-
65
- def _replace(pattern: re.Pattern, ttype: str, t: str) -> str:
66
- return pattern.sub(lambda m: _ph(ttype, m.group(0)), t)
67
-
68
- text = _replace(URL_RE, "URL", text)
69
- text = _replace(MENTION_RE, "MENTION", text)
70
- text = _replace(HASHTAG_RE, "HASHTAG", text)
71
- text = _replace(DATE_RE, "DATE", text)
72
- text = _replace(CURRENCY_RE, "UNIT", text)
73
- text = _replace(NUMBER_RE, "NUM", text)
74
- text = _replace(UNICODE_EMOJI_RE, "EMOJI", text)
75
- text = _replace(TEXT_EMOJI_RE, "EMOJI", text)
76
- return text, placeholders
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
77
 
78
 
79
- def restore_special_tokens(tokens: list[dict], placeholders: list[dict]) -> list[dict]:
80
- """Restore placeholders in the token stream."""
81
- if not placeholders:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
82
  return tokens
83
 
84
- ph_map = {p["placeholder"]: p for p in placeholders}
85
- restored: set[str] = set()
86
- result: list[dict] = []
 
 
87
 
88
- for tok in tokens:
89
- raw = tok["token"]
90
- matched = next(((ph, info) for ph, info in ph_map.items() if ph in raw), None)
91
- if matched:
92
- ph, info = matched
93
- if ph not in restored:
94
- restored.add(ph)
95
- ttype = info["type"]
96
- result.append({
97
- "token": f" {info['original']}",
98
- "type": ttype,
99
- f"_{ttype.lower()}": True,
100
- })
101
- else:
102
- result.append(tok)
103
-
104
- return result
105
 
 
106
 
107
  def reclassify_numbers_in_tokens(tokens: list[dict]) -> list[dict]:
108
- """Catch remaining number/unit tokens missed by pre-tokenization."""
109
  result: list[dict] = []
110
  for tok in tokens:
111
  if tok["type"] not in ("BPE", "ROOT"):
 
1
+ """Fix 8: Special token normalization (NUM, DATE, URL, MENTION, HASHTAG, EMOJI).
2
+
3
+ Uses a segment-based approach: special tokens are detected and extracted
4
+ *before* the base tokenizer runs, so they never pass through it.
5
+ """
6
 
7
  from __future__ import annotations
8
 
 
30
  "xi","xii","xiii","xiv","xv","xvi","xvii","xviii","xix","xx",
31
  }
32
 
33
+ # ── Regex patterns ────────────────────────────────────────────────────────────
34
+
35
  URL_RE = re.compile(r'https?://\S+|www\.\S+', re.IGNORECASE)
36
  MENTION_RE = re.compile(r'@[\w\u00C0-\u024F]+')
37
  HASHTAG_RE = re.compile(r'#[\w\u00C0-\u024F]+')
38
+
39
+ # Turkish suffixes that can follow a number+apostrophe
40
+ _NUM_SUFFIXES = sorted(
41
+ [
42
+ "nın","nin","nun","nün","dan","den","tan","ten",
43
+ "da","de","ta","te","ya","ye","nda","nde",
44
+ "yı","yi","yu","yü","nı","ni","nu","nü",
45
+ "lar","ler","lara","lere","ları","leri",
46
+ "ım","im","um","üm","ın","in","un","ün",
47
+ "mız","miz","muz","müz","nız","niz","nuz","nüz",
48
+ "dır","dir","dur","dür","tır","tir","tur","tür",
49
+ "ki","li","lı","lu","lü","sız","siz","suz","süz",
50
+ "inci","ıncı","uncu","üncü","nci","ncı",
51
+ "lık","lik","luk","lük",
52
+ "a","e","ı","i","u","ü",
53
+ ],
54
+ key=len,
55
+ reverse=True,
56
  )
57
+
58
+ _SUFFIX_ALT = '|'.join(re.escape(s) for s in _NUM_SUFFIXES)
59
+
60
+ # Number (or time) followed by apostrophe + Turkish suffix(es)
61
+ NUM_APOSTROPHE_RE = re.compile(
62
+ r"\d+(?:[.:,]\d+)*['\u2019](?:" + _SUFFIX_ALT + r")+\b",
63
+ re.IGNORECASE,
64
+ )
65
+
66
  DATE_RE = re.compile(
67
  r'\d{1,2}[./\-]\d{1,2}[./\-]\d{2,4}'
68
  r'|\d{4}[./\-]\d{1,2}[./\-]\d{1,2}'
69
  )
70
  CURRENCY_RE = re.compile(r'[$€£¥₺₽]\d+[\.,]?\d*|\d+[\.,]?\d*[$€£¥₺₽]')
71
+ NUMBER_RE = re.compile(
72
+ r'%\d+[\.,]?\d*'
73
+ r'|\d{1,3}(?:\.\d{3})+' # thousands (1.000.000) — before decimal!
74
+ r'|\d+[\.,]\d+' # decimal (2.5, 10,5)
75
+ r'|\d+%'
76
+ r'|\d+/\d+'
77
+ )
78
+ TIME_RE = re.compile(r'\d{1,2}:\d{2}(?::\d{2})?')
79
+ PLAIN_NUM_RE = re.compile(r'\b\d+\b')
80
+
81
  TEXT_EMOJI_RE = re.compile(r'[:;=]-?[\)\(\]\[dDpPoO3]|<3')
82
  UNICODE_EMOJI_RE = re.compile(
83
  "[\U0001F600-\U0001F64F\U0001F300-\U0001F5FF"
 
87
  flags=re.UNICODE,
88
  )
89
 
90
+ # Pattern priority: earlier entries win when spans overlap.
91
+ _SPAN_PATTERNS: list[tuple[re.Pattern, str]] = [
92
+ (URL_RE, "URL"),
93
+ (MENTION_RE, "MENTION"),
94
+ (HASHTAG_RE, "HASHTAG"),
95
+ (DATE_RE, "DATE"),
96
+ (CURRENCY_RE, "UNIT"),
97
+ (NUM_APOSTROPHE_RE, "NUM_APO"),
98
+ (NUMBER_RE, "NUM"),
99
+ (TIME_RE, "NUM"),
100
+ (PLAIN_NUM_RE, "NUM"),
101
+ (UNICODE_EMOJI_RE, "EMOJI"),
102
+ (TEXT_EMOJI_RE, "EMOJI"),
103
+ ]
104
+
105
+
106
+ # ── Segment-based API ────────────────────────────────────────────────────────
107
+
108
+ def find_special_spans(text: str) -> list[tuple[int, int, str, str]]:
109
+ """Find all special-token spans in *text*.
110
+
111
+ Returns a sorted, non-overlapping list of
112
+ ``(start, end, token_type, original_text)``.
113
+ """
114
+ candidates: list[tuple[int, int, str, str]] = []
115
+ for pattern, ttype in _SPAN_PATTERNS:
116
+ for m in pattern.finditer(text):
117
+ candidates.append((m.start(), m.end(), ttype, m.group(0)))
118
+
119
+ # Sort by start position, then prefer longer match
120
+ candidates.sort(key=lambda x: (x[0], -(x[1] - x[0])))
121
+
122
+ # Greedy non-overlapping selection
123
+ result: list[tuple[int, int, str, str]] = []
124
+ last_end = 0
125
+ for s, e, t, o in candidates:
126
+ if s >= last_end:
127
+ result.append((s, e, t, o))
128
+ last_end = e
129
+ return result
130
 
131
 
132
+ def make_special_tokens(span_type: str, original: str) -> list[dict]:
133
+ """Create token dict(s) for a matched special span.
134
+
135
+ ``NUM_APO`` spans are split into a NUM token + SUFFIX token(s).
136
+ """
137
+ if span_type == "NUM_APO":
138
+ apo_pos = original.find("'")
139
+ if apo_pos == -1:
140
+ apo_pos = original.find("\u2019")
141
+ num_part = original[:apo_pos]
142
+ suffix_str = original[apo_pos + 1:]
143
+
144
+ tokens: list[dict] = [{"token": f" {num_part}", "type": "NUM", "_num": True}]
145
+
146
+ # Split suffix_str into individual Turkish suffixes
147
+ remaining = suffix_str.lower()
148
+ while remaining:
149
+ matched = False
150
+ for s in _NUM_SUFFIXES:
151
+ if remaining.startswith(s):
152
+ tokens.append({"token": s, "type": "SUFFIX", "_apo_suffix": True})
153
+ remaining = remaining[len(s):]
154
+ matched = True
155
+ break
156
+ if not matched:
157
+ # Safety fallback — shouldn't happen if the regex matched
158
+ tokens.append({"token": remaining, "type": "SUFFIX", "_apo_suffix": True})
159
+ break
160
  return tokens
161
 
162
+ return [{
163
+ "token": f" {original}",
164
+ "type": span_type,
165
+ f"_{span_type.lower()}": True,
166
+ }]
167
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
168
 
169
+ # ── Safety-net post-pass ─────────────────────────────────────────────────────
170
 
171
  def reclassify_numbers_in_tokens(tokens: list[dict]) -> list[dict]:
172
+ """Catch remaining number/unit tokens missed by span detection."""
173
  result: list[dict] = []
174
  for tok in tokens:
175
  if tok["type"] not in ("BPE", "ROOT"):
turk_tokenizer/tokenizer.py CHANGED
@@ -37,8 +37,8 @@ from ._root_validator import validate_roots, ZEMBEREK_AVAILABLE
37
  from ._medical_vocab import ALL_DOMAIN_ROOTS
38
  from ._tdk_vocab import reclassify_foreign_words
39
  from ._normalizer import (
40
- preprocess_special_tokens,
41
- restore_special_tokens,
42
  reclassify_numbers_in_tokens,
43
  )
44
  from ._allomorph import add_canonical_labels
@@ -114,20 +114,35 @@ class TurkTokenizer:
114
  Returns a list of token dicts, each with:
115
  ``token``, ``token_type``, ``morph_pos``, and optional ``_*`` fields.
116
  """
117
- # Fix 8 pre: replace URLs, mentions, numbers etc. with placeholders
118
- text_norm, specials = preprocess_special_tokens(text)
119
-
120
- # Fix 1 & 2 pre: ALL CAPS + apostrophe
121
- processed, caps_map, apo_splits = preprocess(text_norm)
122
-
123
- # Base tokenizer
124
- raw = self._base.tokenize_text(processed)
125
-
126
- # Fix 8 post: restore placeholders
127
- tokens = restore_special_tokens(raw, specials)
128
-
129
- # Fix 1 & 2 post
130
- tokens = postprocess(tokens, caps_map, apo_splits)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
131
 
132
  # Fix 3 + 5: BPE→SUFFIX reclassification + PUNCT
133
  tokens = reclassify_bpe_suffixes(tokens)
 
37
  from ._medical_vocab import ALL_DOMAIN_ROOTS
38
  from ._tdk_vocab import reclassify_foreign_words
39
  from ._normalizer import (
40
+ find_special_spans,
41
+ make_special_tokens,
42
  reclassify_numbers_in_tokens,
43
  )
44
  from ._allomorph import add_canonical_labels
 
114
  Returns a list of token dicts, each with:
115
  ``token``, ``token_type``, ``morph_pos``, and optional ``_*`` fields.
116
  """
117
+ # Fix 8: detect special tokens (NUM, DATE, URL, MENTION, HASHTAG, …)
118
+ # and split text into segments so they never enter the base tokenizer.
119
+ spans = find_special_spans(text)
120
+
121
+ tokens: list[dict] = []
122
+ pos = 0
123
+
124
+ for start, end, ttype, original in spans:
125
+ # Tokenize normal text before this special span
126
+ if pos < start:
127
+ segment = text[pos:start]
128
+ if segment.strip():
129
+ seg_proc, caps, apo = preprocess(segment)
130
+ seg_raw = self._base.tokenize_text(seg_proc)
131
+ seg_tokens = postprocess(seg_raw, caps, apo)
132
+ tokens.extend(seg_tokens)
133
+
134
+ # Insert the special token(s) directly
135
+ tokens.extend(make_special_tokens(ttype, original))
136
+ pos = end
137
+
138
+ # Tokenize remaining text after the last special span
139
+ if pos < len(text):
140
+ segment = text[pos:]
141
+ if segment.strip():
142
+ seg_proc, caps, apo = preprocess(segment)
143
+ seg_raw = self._base.tokenize_text(seg_proc)
144
+ seg_tokens = postprocess(seg_raw, caps, apo)
145
+ tokens.extend(seg_tokens)
146
 
147
  # Fix 3 + 5: BPE→SUFFIX reclassification + PUNCT
148
  tokens = reclassify_bpe_suffixes(tokens)