| """Fix 8: Special token normalization (NUM, DATE, URL, MENTION, HASHTAG, EMOJI). |
| |
| Uses a segment-based approach: special tokens are detected and extracted |
| *before* the base tokenizer runs, so they never pass through it. |
| """ |
|
|
| from __future__ import annotations |
|
|
| import re |
|
|
# Lower-cased month names (Turkish + English) used to retype standalone
# BPE tokens as month ROOTs in reclassify_numbers_in_tokens.
# NOTE(review): the Turkish entries look mojibake-damaged in this view
# (e.g. the second month); the file on disk is presumably valid UTF-8
# with Turkish diacritics -- confirm encoding before editing these.
MONTH_NAMES = {
    "ocak","Εubat","mart","nisan","mayΔ±s","haziran",
    "temmuz","aΔustos","eylΓΌl","ekim","kasΔ±m","aralΔ±k",
    "january","february","march","april","may","june",
    "july","august","september","october","november","december",
}
|
|
# Lower-cased unit abbreviations (length, mass, time, currency codes,
# data sizes, medical and physics units).  Standalone BPE tokens found in
# this set are retyped to UNIT by reclassify_numbers_in_tokens.
UNITS = {
    "km","m","cm","mm","nm",
    "kg","g","mg","ton",
    "sn","dk","sa","ms",
    "tl","usd","eur","gbp",
    "kb","mb","gb","tb","pb",
    "ml","mcg","meq","iu","mmhg","mosm",
    "hz","mhz","ghz","watt","kw","mw","kcal","cal",
}
|
|
# Lower-cased Roman numerals retyped to NUM when seen as standalone BPE
# tokens.  NOTE(review): the single letters "v" and "x" (and l/c/d/m) are
# absent -- presumably excluded on purpose because they collide with
# ordinary one-letter tokens; confirm before "completing" the set.
ROMAN_NUMERALS = {
    "i","ii","iii","iv","vi","vii","viii","ix",
    "xi","xii","xiii","xiv","xv","xvi","xvii","xviii","xix","xx",
}
|
|
| |
|
|
# Web artifacts are detected as whole spans before base tokenization.
URL_RE = re.compile(r'https?://\S+|www\.\S+', re.IGNORECASE)
# NOTE(review): \w is already Unicode-aware in Python 3, so the explicit
# Latin-Extended range \u00C0-\u024F is redundant (but harmless).
MENTION_RE = re.compile(r'@[\w\u00C0-\u024F]+')
HASHTAG_RE = re.compile(r'#[\w\u00C0-\u024F]+')
|
|
| |
# Turkish inflectional suffixes that may follow an apostrophe attached to
# a number or acronym (e.g. 2023'te, NATO'nun).  Sorted longest-first so
# that both the regex alternation built below and the greedy matcher in
# _split_apostrophe_suffixes prefer the longest suffix.
_NUM_SUFFIXES = sorted(
    [
        "nΔ±n","nin","nun","nΓΌn","dan","den","tan","ten",
        "da","de","ta","te","ya","ye","nda","nde",
        "yΔ±","yi","yu","yΓΌ","nΔ±","ni","nu","nΓΌ",
        "lar","ler","lara","lere","larΔ±","leri",
        "Δ±m","im","um","ΓΌm","Δ±n","in","un","ΓΌn",
        "mΔ±z","miz","muz","mΓΌz","nΔ±z","niz","nuz","nΓΌz",
        "dΔ±r","dir","dur","dΓΌr","tΔ±r","tir","tur","tΓΌr",
        "ki","li","lΔ±","lu","lΓΌ","sΔ±z","siz","suz","sΓΌz",
        "inci","Δ±ncΔ±","uncu","ΓΌncΓΌ","nci","ncΔ±",
        "lΔ±k","lik","luk","lΓΌk",
        "a","e","Δ±","i","u","ΓΌ",
    ],
    key=len,
    reverse=True,
)


# Pre-escaped longest-first alternation reused by the apostrophe regexes.
_SUFFIX_ALT = '|'.join(re.escape(s) for s in _NUM_SUFFIXES)
|
|
| |
# Number + apostrophe + one or more Turkish suffixes (e.g. 1923'ten,
# 12:30'da).  [.:,] inside the number part covers decimals, clock times
# and thousands groups; IGNORECASE also accepts upper-case suffixes.
# Both the ASCII apostrophe and U+2019 are accepted.
NUM_APOSTROPHE_RE = re.compile(
    r"\d+(?:[.:,]\d+)*['\u2019](?:" + _SUFFIX_ALT + r")+\b",
    re.IGNORECASE,
)
|
|
# Numeric dates: day-month-year or year-first, with . / or - separators.
DATE_RE = re.compile(
    r'\d{1,2}[./\-]\d{1,2}[./\-]\d{2,4}'
    r'|\d{4}[./\-]\d{1,2}[./\-]\d{1,2}'
)
# Currency symbol before or after the amount ($100, 100$ styles).
CURRENCY_RE = re.compile(r'[$β¬Β£Β₯βΊβ½]\d+[\.,]?\d*|\d+[\.,]?\d*[$β¬Β£Β₯βΊβ½]')
# General numbers: percentages, dot-grouped thousands, decimals, fractions.
# The thousands alternative carries a (?!\d) guard so it cannot stop in
# the middle of a digit run: without it, "12.2023" matched only "12.202",
# stranding a trailing "3" that PLAIN_NUM_RE could never pick up (no word
# boundary between two digits).  With the guard, the thousands branch
# fails there and the decimal branch matches the full "12.2023" instead.
NUMBER_RE = re.compile(
    r'%\d+[\.,]?\d*'
    r'|\d{1,3}(?:\.\d{3})+(?!\d)'
    r'|\d+[\.,]\d+'
    r'|\d+%'
    r'|\d+/\d+'
)
# Clock times: 12:30 or 12:30:45.
TIME_RE = re.compile(r'\d{1,2}:\d{2}(?::\d{2})?')
# Bare word-bounded integers not caught by a more specific pattern.
PLAIN_NUM_RE = re.compile(r'\b\d+\b')
|
|
| |
| |
| |
| |
| |
# ALL-CAPS acronyms (NATO), optionally with trailing digits (HTML5), or a
# single capital followed by digits (B12).  NOTE(review): the non-ASCII
# letters in the character class appear mojibake-damaged in this view;
# they are presumably the Turkish capitals (C-cedilla, G-breve, dotted I,
# O/U-umlaut, S-cedilla) -- confirm the file encoding before editing.
ACRONYM_RE = re.compile(
    r"\b[A-ZΓΔΔ°ΓΕΓ]{2,}[0-9]*\b"
    r"|\b[A-ZΓΔΔ°ΓΕΓ][0-9]+\b"
)


# Same acronym shapes followed by apostrophe + Turkish suffix chain
# (e.g. NATO'nun) so the suffixes can be split off as SUFFIX tokens.
ACRONYM_APOSTROPHE_RE = re.compile(
    r"\b(?:[A-ZΓΔΔ°ΓΕΓ]{2,}[0-9]*|[A-ZΓΔΔ°ΓΕΓ][0-9]+)['\u2019](?:"
    + _SUFFIX_ALT + r")+\b"
)


# ASCII emoticons such as :-) ;D =P :3 and <3.
TEXT_EMOJI_RE = re.compile(r'[:;=]-?[\)\(\]\[dDpPoO3]|<3')
# Runs of characters from common Unicode emoji/symbol blocks (emoticons,
# misc symbols & pictographs, transport, regional indicators, dingbats,
# supplemental symbols, misc symbols).
UNICODE_EMOJI_RE = re.compile(
    "[\U0001F600-\U0001F64F\U0001F300-\U0001F5FF"
    "\U0001F680-\U0001F6FF\U0001F1E0-\U0001F1FF"
    "\U00002700-\U000027BF\U0001F900-\U0001F9FF"
    "\U00002600-\U000026FF]+",
    flags=re.UNICODE,
)
|
|
| |
# (pattern, token_type) pairs scanned by find_special_spans.  List order
# does NOT establish priority by itself: overlaps are resolved by start
# position, then span length.  Order only breaks exact ties -- Python's
# stable sort keeps the earlier pattern's candidate first when two
# candidates share both start and length.
_SPAN_PATTERNS: list[tuple[re.Pattern, str]] = [
    (URL_RE, "URL"),
    (MENTION_RE, "MENTION"),
    (HASHTAG_RE, "HASHTAG"),
    (DATE_RE, "DATE"),
    (CURRENCY_RE, "UNIT"),
    (NUM_APOSTROPHE_RE, "NUM_APO"),
    (ACRONYM_APOSTROPHE_RE, "ACRONYM_APO"),
    (ACRONYM_RE, "ACRONYM"),
    (NUMBER_RE, "NUM"),
    (TIME_RE, "NUM"),
    (PLAIN_NUM_RE, "NUM"),
    (UNICODE_EMOJI_RE, "EMOJI"),
    (TEXT_EMOJI_RE, "EMOJI"),
]
|
|
|
|
| |
|
|
def _is_known_turkish_word(word_upper: str) -> bool:
    """Return True if *word_upper* (ALL CAPS) is a known Turkish word.

    Lookup order:
      1. ACRONYM_EXPANSIONS, exact            -> acronym, False
      2. ACRONYM_EXPANSIONS, digits stripped  -> acronym, False (HTML5 -> HTML)
      3. TDK dictionary                       -> Turkish word, True
      4. Proper-noun list                     -> Turkish word, True
      5. anything else                        -> treated as acronym, False
    """
    # Function-local imports -- presumably to avoid import cycles at
    # module load time; confirm before hoisting to the top of the file.
    from ._acronym_dict import ACRONYM_EXPANSIONS
    from ._preprocessor import _turkish_lower, _load_proper_nouns
    from ._tdk_vocab import load_tdk_words

    if word_upper in ACRONYM_EXPANSIONS:
        return False

    # Strip trailing digits so e.g. HTML5 resolves through HTML.
    stem = word_upper.rstrip("0123456789")
    if stem and stem != word_upper and stem in ACRONYM_EXPANSIONS:
        return False

    lowered = _turkish_lower(word_upper)

    vocab = load_tdk_words()
    if vocab and lowered in vocab:
        return True

    return lowered in _load_proper_nouns()
|
|
|
|
| |
|
|
def find_special_spans(text: str) -> list[tuple[int, int, str, str]]:
    """Locate every special-token span in *text*.

    Returns ``(start, end, token_type, original_text)`` tuples, sorted by
    position and guaranteed non-overlapping: the earliest start wins, and
    on a tie the longest candidate wins.
    """
    candidates: list[tuple[int, int, str, str]] = []
    for regex, label in _SPAN_PATTERNS:
        for match in regex.finditer(text):
            surface = match.group(0)

            # Acronym-shaped candidates that are actually known Turkish
            # words (per dictionary lookup) are left to the base tokenizer.
            if label in ("ACRONYM", "ACRONYM_APO"):
                if label == "ACRONYM_APO":
                    cut = surface.find("'")
                    if cut == -1:
                        cut = surface.find("\u2019")
                    base = surface[:cut]
                else:
                    base = surface
                if _is_known_turkish_word(base):
                    continue

            candidates.append((match.start(), match.end(), label, surface))

    # Earliest start first; for equal starts, the longer span first
    # (start - end is more negative for longer spans).
    candidates.sort(key=lambda c: (c[0], c[0] - c[1]))

    # Greedy sweep: accept a candidate only if it begins at or after the
    # end of the last accepted span.
    accepted: list[tuple[int, int, str, str]] = []
    cursor = 0
    for span in candidates:
        if span[0] >= cursor:
            accepted.append(span)
            cursor = span[1]
    return accepted
|
|
|
|
| def _split_apostrophe_suffixes(suffix_str: str) -> list[dict]: |
| """Split a suffix string (after apostrophe) into individual SUFFIX tokens.""" |
| tokens: list[dict] = [] |
| remaining = suffix_str.lower() |
| while remaining: |
| matched = False |
| for s in _NUM_SUFFIXES: |
| if remaining.startswith(s): |
| tokens.append({"token": s, "type": "SUFFIX", "_apo_suffix": True}) |
| remaining = remaining[len(s):] |
| matched = True |
| break |
| if not matched: |
| tokens.append({"token": remaining, "type": "SUFFIX", "_apo_suffix": True}) |
| break |
| return tokens |
|
|
|
|
def make_special_tokens(span_type: str, original: str) -> list[dict]:
    """Create token dict(s) for a matched special span.

    ``NUM_APO`` and ``ACRONYM_APO`` spans are split into base + SUFFIX tokens.
    """
    # The two apostrophe variants differ only in the base token's type/flag.
    if span_type in ("NUM_APO", "ACRONYM_APO"):
        cut = original.find("'")
        if cut == -1:
            cut = original.find("\u2019")
        base = original[:cut]
        if span_type == "NUM_APO":
            base_type, flag = "NUM", "_num"
        else:
            base_type, flag = "ACRONYM", "_acronym"
        return [
            {"token": f" {base}", "type": base_type, flag: True},
            *_split_apostrophe_suffixes(original[cut + 1:]),
        ]

    if span_type == "ACRONYM":
        return [{"token": f" {original}", "type": "ACRONYM", "_acronym": True}]

    # Generic case (URL, MENTION, HASHTAG, DATE, UNIT, NUM, EMOJI): one
    # token whose marker flag is derived from the span type.
    return [{
        "token": f" {original}",
        "type": span_type,
        f"_{span_type.lower()}": True,
    }]
|
|
|
|
| |
|
|
def reclassify_numbers_in_tokens(tokens: list[dict]) -> list[dict]:
    """Catch remaining number/unit tokens missed by span detection."""
    out: list[dict] = []
    for tok in tokens:
        ttype = tok["type"]
        # Only base-tokenizer output (BPE/ROOT) is reconsidered; tokens
        # already classified as special pass through untouched.
        if ttype not in ("BPE", "ROOT"):
            out.append(tok)
            continue

        stripped = tok["token"].strip()
        lowered = stripped.lower()

        if NUMBER_RE.fullmatch(stripped):
            out.append({**tok, "type": "NUM", "_num": True})
        elif ttype == "BPE" and lowered in UNITS:
            out.append({**tok, "type": "UNIT", "_unit": True})
        elif ttype == "BPE" and lowered in ROMAN_NUMERALS:
            out.append({**tok, "type": "NUM", "_roman": True})
        elif ttype == "BPE" and lowered in MONTH_NAMES:
            out.append({**tok, "type": "ROOT", "_month": True})
        else:
            out.append(tok)

    return out
|
|