Initial release: TurkTokenizer v1.0.0 — TR-MMLU 92%
- .gitattributes +1 -0
- README.md +140 -0
- pyproject.toml +39 -0
- tokenizer_config.json +10 -0
- turk_tokenizer/__init__.py +21 -0
- turk_tokenizer/_acronym_dict.py +95 -0
- turk_tokenizer/_allomorph.py +46 -0
- turk_tokenizer/_compound.py +77 -0
- turk_tokenizer/_context_aware.py +60 -0
- turk_tokenizer/_java_check.py +57 -0
- turk_tokenizer/_medical_vocab.py +139 -0
- turk_tokenizer/_normalizer.py +128 -0
- turk_tokenizer/_preprocessor.py +163 -0
- turk_tokenizer/_root_validator.py +206 -0
- turk_tokenizer/_suffix_expander.py +212 -0
- turk_tokenizer/_tdk_vocab.py +90 -0
- turk_tokenizer/data/zemberek-full.jar +3 -0
- turk_tokenizer/tokenizer.py +308 -0
.gitattributes
CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+turk_tokenizer/data/zemberek-full.jar filter=lfs diff=lfs merge=lfs -text
README.md
ADDED
@@ -0,0 +1,140 @@
---
language:
- tr
tags:
- tokenizer
- morphology
- turkish
- nlp
license: mit
library_name: turk-tokenizer
---

# TurkTokenizer

**Turkish morphological tokenizer — TR-MMLU world record 92%**

TurkTokenizer performs linguistically aware tokenization of Turkish text using morphological rules. Unlike BPE-based tokenizers, it produces meaningful morphological units (roots and suffixes) aligned with Turkish grammar.

## Installation

```bash
pip install git+https://huggingface.co/Ethosoft/turk-tokenizer
```

**Java is required** (for Zemberek morphological analysis):

| OS | Command |
|---|---|
| Ubuntu / Debian | `sudo apt install default-jre` |
| Fedora / RHEL | `sudo dnf install java-latest-openjdk` |
| macOS | `brew install openjdk` |
| Windows | `winget install Microsoft.OpenJDK.21` |

## Quick Start

```python
from turk_tokenizer import TurkTokenizer

tok = TurkTokenizer()
tokens = tok("İstanbul'da meeting'e katılamadım")

for t in tokens:
    print(t["token"], t["token_type"], t["morph_pos"])
```

Output:
```
<uppercase_word> ROOT 0
istanbul ROOT 0
da SUFFIX 1
meeting FOREIGN 0
e SUFFIX 1
katılama ROOT 0
dı SUFFIX 1
m SUFFIX 2
```

## Output Fields

Each token is a dict with the following guaranteed fields:

| Field | Type | Description |
|---|---|---|
| `token` | `str` | Token string (leading space = word-initial) |
| `token_type` | `str` | See types below |
| `morph_pos` | `int` | `0` = root/word-initial, `1` = first suffix, `2` = second… |

### Token Types

| Type | Description |
|---|---|
| `ROOT` | Turkish root word |
| `SUFFIX` | Turkish morphological suffix |
| `FOREIGN` | Foreign/loanword root (e.g. "meeting", "zoom") |
| `BPE` | Unknown subword (fallback) |
| `PUNCT` | Punctuation mark |
| `NUM` | Number |
| `DATE` | Date |
| `UNIT` | Measurement unit |
| `URL` | Web URL |
| `MENTION` | @username |
| `HASHTAG` | #topic |
| `EMOJI` | Emoji |

### Optional Metadata Fields

| Field | Description |
|---|---|
| `_canonical` | Canonical morpheme ID (e.g. `"PL"`, `"ACC"`, `"DAT"`) |
| `_suffix_label` | Detailed morphological label (e.g. `"-PL+ACC"`) |
| `_foreign` | `True` if foreign root |
| `_caps` | `True` if originally ALL CAPS |
| `_domain` | `True` if medical/sports/tourism domain |
| `_compound` | `True` if compound word |
| `_parts` | Compound word parts |
| `_expansion` | Acronym expansion (e.g. `"CMV"` → `"Sitomegalovirüs"`) |
| `_pos` | POS tag from Zemberek (Noun, Verb, Adj…) |
| `_lemma` | Lemma from Zemberek |
| `_disambiguated` | `True` if context disambiguation was applied |
| `_root_corrected` | `True` if root was corrected by Zemberek |

## Batch Tokenization

```python
texts = ["Ankara'da kar yağıyor.", "Meeting'e katılacak mısın?"]
results = tok.batch_tokenize(texts, workers=4)
```

## Statistics

```python
tokens = tok("Türk dili zengin bir morfolojiye sahiptir.")
s = tok.stats(tokens)
print(f"TR coverage: {s['tr_pct']}%")
```

## Morphological Fixes Applied

1. **ALL CAPS** — `"İSTANBUL"` → 2 tokens instead of 16
2. **Apostrophe splitting** — `"meeting'e"` → `[meeting:FOREIGN][e:SUFFIX]`
3. **BPE→SUFFIX** — 260+ suffix patterns reclassified
4. **Zemberek root validation** — phonetic root correction (`"gök"` → `"göğüs"`)
5. **Punctuation** — classified as PUNCT (counted in TR coverage)
6. **Domain vocabulary** — 500+ medical/sports/tourism roots
7. **TDK FOREIGN detection** — 76K+ Turkish words used as reference
8. **Special token normalization** — NUM, DATE, URL, MENTION, HASHTAG, EMOJI
9. **Allomorph canonicalization** — `"lar"/"ler"` → `PL`, `"dan"/"den"` → `ABL`
10. **Compound decomposition** — `"başbakan"` → `["baş", "bakan"]`
11. **Acronym expansion** — `"CMV"` → `"Sitomegalovirüs"`
12. **Context disambiguation** — Zemberek sentence-level POS selection

## Benchmark

| Benchmark | Score |
|---|---|
| TR-MMLU | **92%** (world record) |

## License

MIT
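A short follow-on sketch of consuming the optional metadata fields documented in the README above; it is illustrative only (not part of the committed README) and assumes the same `TurkTokenizer` API as the Quick Start, with the package and Java installed:

```python
from turk_tokenizer import TurkTokenizer

tok = TurkTokenizer()
tokens = tok("NATO'ya mektuplar gönderildi")

# Canonical morpheme IDs are attached to SUFFIX tokens, acronym expansions to roots.
suffix_ids = [t.get("_canonical") for t in tokens if t["token_type"] == "SUFFIX"]
expansions = {t["token"].strip(): t["_expansion"] for t in tokens if "_expansion" in t}
print(suffix_ids)
print(expansions)
```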
pyproject.toml
ADDED
@@ -0,0 +1,39 @@
[build-system]
requires = ["setuptools>=61", "wheel"]
build-backend = "setuptools.build_meta"

[project]
name = "turk-tokenizer"
version = "1.0.0"
description = "Turkish morphological tokenizer — TR-MMLU world record %92"
readme = "README.md"
license = { text = "MIT" }
authors = [{ name = "Ethosoft", email = "info@ethosoft.ai" }]
requires-python = ">=3.10"
keywords = ["turkish", "nlp", "tokenizer", "morphology", "huggingface"]
classifiers = [
    "Programming Language :: Python :: 3",
    "License :: OSI Approved :: MIT License",
    "Operating System :: OS Independent",
    "Topic :: Scientific/Engineering :: Artificial Intelligence",
    "Natural Language :: Turkish",
]
dependencies = [
    "turkish-tokenizer>=0.1.0",
    "jpype1>=1.4.0",
    "requests>=2.28.0",
]

[project.optional-dependencies]
dev = ["pytest", "huggingface_hub"]

[project.urls]
Homepage = "https://huggingface.co/Ethosoft/turk-tokenizer"
Repository = "https://huggingface.co/Ethosoft/turk-tokenizer"

[tool.setuptools.packages.find]
where = ["."]
include = ["turk_tokenizer*"]

[tool.setuptools.package-data]
turk_tokenizer = ["data/*.jar"]
tokenizer_config.json
ADDED
@@ -0,0 +1,10 @@
{
  "tokenizer_class": "TurkTokenizer",
  "model_type": "turk-tokenizer",
  "version": "1.0.0",
  "description": "Turkish morphological tokenizer — TR-MMLU world record 92%",
  "language": "tr",
  "authors": "Ethosoft",
  "requires_java": true,
  "dependencies": ["turkish-tokenizer", "jpype1"]
}
turk_tokenizer/__init__.py
ADDED
@@ -0,0 +1,21 @@
"""
TurkTokenizer — Turkish morphological tokenizer.
TR-MMLU world record: 92%

Usage:
    from turk_tokenizer import TurkTokenizer

    tok = TurkTokenizer()
    tokens = tok("İstanbul'da meeting'e katılamadım")

    # Each token dict contains:
    #   token      : str — token string (with leading space if word-initial)
    #   token_type : str — ROOT | SUFFIX | FOREIGN | BPE | PUNCT |
    #                      NUM | DATE | UNIT | URL | MENTION | HASHTAG | EMOJI
    #   morph_pos  : int — 0=root/word-initial, 1=first suffix, 2=second...
"""

from .tokenizer import TurkTokenizer

__all__ = ["TurkTokenizer"]
__version__ = "1.0.0"
turk_tokenizer/_acronym_dict.py
ADDED
@@ -0,0 +1,95 @@
"""Fix 11: Acronym/abbreviation expansion dictionary."""

from __future__ import annotations

ACRONYM_EXPANSIONS: dict[str, str] = {
    # International organizations
    "NATO": "Kuzey Atlantik Antlaşması Örgütü",
    "UN": "Birleşmiş Milletler",
    "UNESCO": "BM Eğitim, Bilim ve Kültür Örgütü",
    "UNICEF": "BM Çocuklara Yardım Fonu",
    "WHO": "Dünya Sağlık Örgütü",
    "IMF": "Uluslararası Para Fonu",
    "WTO": "Dünya Ticaret Örgütü",
    "EU": "Avrupa Birliği",
    "INTERPOL": "Uluslararası Kriminal Polis Örgütü",
    "FIFA": "Uluslararası Futbol Federasyonları Birliği",
    "IOC": "Uluslararası Olimpiyat Komitesi",
    "UEFA": "Avrupa Futbol Birliği",
    # Turkish institutions
    "TBMM": "Türkiye Büyük Millet Meclisi",
    "MEB": "Milli Eğitim Bakanlığı",
    "TDK": "Türk Dil Kurumu",
    "TTK": "Türk Tarih Kurumu",
    "TCMB": "Türkiye Cumhuriyet Merkez Bankası",
    "BDDK": "Bankacılık Düzenleme ve Denetleme Kurumu",
    "SPK": "Sermaye Piyasası Kurulu",
    "SGK": "Sosyal Güvenlik Kurumu",
    "KDV": "Katma Değer Vergisi",
    "ÖTV": "Özel Tüketim Vergisi",
    "ÖSYM": "Ölçme, Seçme ve Yerleştirme Merkezi",
    "YÖK": "Yükseköğretim Kurulu",
    "TÜİK": "Türkiye İstatistik Kurumu",
    "TÜBİTAK": "Türkiye Bilimsel ve Teknolojik Araştırma Kurumu",
    "ASELSAN": "Askeri Elektronik Sanayii",
    # Turkish exams
    "TUS": "Tıpta Uzmanlık Sınavı",
    "DUS": "Diş Hekimliğinde Uzmanlık Sınavı",
    "YDUS": "Yabancı Dil Uzmanlık Sınavı",
    "KPSS": "Kamu Personeli Seçme Sınavı",
    # Medical
    "CMV": "Sitomegalovirüs", "EBV": "Epstein-Barr Virüsü",
    "VZV": "Varisella-Zoster Virüsü", "HHV": "İnsan Herpes Virüsü",
    "HSV": "Herpes Simplex Virüsü", "HIV": "İnsan İmmün Yetmezlik Virüsü",
    "HBV": "Hepatit B Virüsü", "HCV": "Hepatit C Virüsü",
    "RSV": "Respiratuar Sinsisyal Virüs", "HPV": "İnsan Papilloma Virüsü",
    "HAV": "Hepatit A Virüsü",
    "SLE": "Sistemik Lupus Eritematozus",
    "COPD": "Kronik Obstrüktif Akciğer Hastalığı",
    "DM": "Diabetes Mellitus", "HTN": "Hipertansiyon",
    "MI": "Miyokard İnfarktüsü", "DVT": "Derin Ven Trombozu",
    "PE": "Pulmoner Emboli",
    "AML": "Akut Myeloid Lösemi", "CML": "Kronik Myeloid Lösemi",
    "ALL": "Akut Lenfoblastik Lösemi", "CLL": "Kronik Lenfositik Lösemi",
    "ECG": "Elektrokardiyogram", "EEG": "Elektroensefalogram",
    "MRI": "Manyetik Rezonans Görüntüleme",
    "CT": "Bilgisayarlı Tomografi", "USG": "Ultrasonografi",
    "CBC": "Tam Kan Sayımı",
    "INR": "Uluslararası Normalleştirilmiş Oran",
    "LDL": "Düşük Yoğunluklu Lipoprotein",
    "HDL": "Yüksek Yoğunluklu Lipoprotein",
    "SMMM": "Serbest Muhasebeci Mali Müşavir",
    "YMM": "Yeminli Mali Müşavir",
    "SM": "Serbest Muhasebeci",
    # Technology
    "AI": "Yapay Zeka", "ML": "Makine Öğrenmesi",
    "LLM": "Büyük Dil Modeli", "NLP": "Doğal Dil İşleme",
    "API": "Uygulama Programlama Arayüzü",
    "CPU": "Merkezi İşlem Birimi", "GPU": "Grafik İşlem Birimi",
    "RAM": "Rastgele Erişim Belleği",
    "SQL": "Yapılandırılmış Sorgu Dili",
    "HTML": "HiperMetin İşaretleme Dili",
    "CSS": "Basamaklı Stil Sayfaları",
    "OS": "İşletim Sistemi",
    "BERT": "Çift Yönlü Kodlayıcı Temsiller",
    "GPT": "Üretici Önceden Eğitilmiş Dönüştürücü",
    # Economics
    "OPEC": "Petrol İhraç Eden Ülkeler Örgütü",
    "NAFTA": "Kuzey Amerika Serbest Ticaret Anlaşması",
    # Sports
    "NBA": "Ulusal Basketbol Birliği",
    "NFL": "Ulusal Futbol Ligi",
}


def reclassify_acronyms(tokens: list[dict]) -> list[dict]:
    """Add ``_expansion`` field to known acronyms in the token stream."""
    result: list[dict] = []
    for tok in tokens:
        if tok["type"] == "ROOT" and (tok.get("_acronym") or tok.get("_caps")):
            expansion = ACRONYM_EXPANSIONS.get(tok["token"].strip().upper())
            if expansion:
                result.append({**tok, "_expansion": expansion, "_known_acronym": True})
                continue
        result.append(tok)
    return result
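A minimal sketch (not part of the commit) of how `reclassify_acronyms` behaves on hand-built token dicts, using the internal `type`/`_caps` keys seen in the function above:

```python
from turk_tokenizer._acronym_dict import reclassify_acronyms

tokens = [
    {"token": " cmv", "type": "ROOT", "_caps": True},  # originally written "CMV"
    {"token": " testi", "type": "ROOT"},               # no acronym flags, passed through
]
out = reclassify_acronyms(tokens)
print(out[0].get("_expansion"))  # -> "Sitomegalovirüs"
```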
turk_tokenizer/_allomorph.py
ADDED
@@ -0,0 +1,46 @@
"""Fix 9: Allomorph canonicalization — map surface forms to morpheme IDs."""

from __future__ import annotations

# NOTE: "ın"/"in"/"un"/"ün" appear twice below (GEN and 2SG); in a dict
# literal the later 2SG entries silently override the earlier GEN ones.
ALLOMORPH_MAP: dict[str, str] = {
    "lar": "PL",  "ler": "PL",
    "ı": "ACC",   "i": "ACC",   "u": "ACC",   "ü": "ACC",
    "yı": "ACC",  "yi": "ACC",  "yu": "ACC",  "yü": "ACC",
    "a": "DAT",   "e": "DAT",   "ya": "DAT",  "ye": "DAT",
    "da": "LOC",  "de": "LOC",  "ta": "LOC",  "te": "LOC",
    "dan": "ABL", "den": "ABL", "tan": "ABL", "ten": "ABL",
    "ın": "GEN",  "in": "GEN",  "un": "GEN",  "ün": "GEN",
    "nın": "GEN", "nin": "GEN", "nun": "GEN", "nün": "GEN",
    "la": "INS",  "le": "INS",  "yla": "INS", "yle": "INS",
    "dı": "PAST", "di": "PAST", "du": "PAST", "dü": "PAST",
    "tı": "PAST", "ti": "PAST", "tu": "PAST", "tü": "PAST",
    "yor": "PROG",
    "ar": "AOR",  "er": "AOR",
    "ır": "AOR",  "ir": "AOR",  "ur": "AOR",  "ür": "AOR",
    "mış": "EVID","miş": "EVID","muş": "EVID","müş": "EVID",
    "ma": "NEG",  "me": "NEG",
    "mak": "INF", "mek": "INF",
    "ım": "1SG",  "im": "1SG",  "um": "1SG",  "üm": "1SG",
    "ın": "2SG",  "in": "2SG",  "un": "2SG",  "ün": "2SG",
    "iz": "1PL",  "ız": "1PL",  "uz": "1PL",  "üz": "1PL",
    "mı": "Q",    "mi": "Q",    "mu": "Q",    "mü": "Q",
    "lı": "WITH", "li": "WITH", "lu": "WITH", "lü": "WITH",
    "sız": "WITHOUT", "siz": "WITHOUT", "suz": "WITHOUT", "süz": "WITHOUT",
    "cı": "AGT",  "ci": "AGT",  "cu": "AGT",  "cü": "AGT",
    "çı": "AGT",  "çi": "AGT",  "çu": "AGT",  "çü": "AGT",
    "lık": "ABSTR", "lik": "ABSTR", "luk": "ABSTR", "lük": "ABSTR",
    "sa": "COND", "se": "COND",
    "ıl": "PASS", "il": "PASS", "ul": "PASS", "ül": "PASS",
}


def add_canonical_labels(tokens: list[dict]) -> list[dict]:
    """Add ``_canonical`` field to SUFFIX tokens (e.g. 'lar'/'ler' → 'PL')."""
    result: list[dict] = []
    for tok in tokens:
        if tok["type"] != "SUFFIX":
            result.append(tok)
            continue
        canonical = ALLOMORPH_MAP.get(tok["token"].strip().lower())
        result.append({**tok, "_canonical": canonical} if canonical else tok)
    return result
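A minimal usage sketch (not part of the commit), again assuming the internal `type` key used by the module above:

```python
from turk_tokenizer._allomorph import add_canonical_labels

tokens = [
    {"token": " ev", "type": "ROOT"},
    {"token": "ler", "type": "SUFFIX"},
    {"token": "den", "type": "SUFFIX"},
]
labelled = add_canonical_labels(tokens)
print([t.get("_canonical") for t in labelled])  # -> [None, 'PL', 'ABL']
```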
turk_tokenizer/_compound.py
ADDED
@@ -0,0 +1,77 @@
"""Fix 10: Turkish compound word annotation."""

from __future__ import annotations

KNOWN_COMPOUNDS: dict[str, list[str]] = {
    "başbakan": ["baş", "bakan"],
    "cumhurbaşkanı": ["cumhur", "başkan"],
    "dışişleri": ["dış", "iş"],
    "içişleri": ["iç", "iş"],
    "maliye": ["mal", "iye"],
    "belediye": ["beled", "iye"],
    "ayakkabı": ["ayak", "kap"],
    "yelkovan": ["yel", "kovan"],
    "saatlik": ["saat", "lik"],
    "günlük": ["gün", "lük"],
    "yıllık": ["yıl", "lık"],
    "aylık": ["ay", "lık"],
    "haftalık": ["hafta", "lık"],
    "gastrointestinal": ["gastro", "intestinal"],
    "kardiyovasküler": ["kardio", "vasküler"],
    "nöropsikiyatri": ["nöro", "psikiyatri"],
    "biyokimya": ["biyo", "kimya"],
    "mikrobiyoloji": ["mikro", "biyoloji"],
    "farmakoloji": ["farma", "koloji"],
    "patoloji": ["pato", "loji"],
    "hematoloji": ["hemato", "loji"],
    "nefroloji": ["nefro", "loji"],
    "kardiyoloji": ["kardio", "loji"],
    "radyoloji": ["radyo", "loji"],
    "onkoloji": ["onko", "loji"],
    "elektromanyetik": ["elektro", "manyetik"],
    "termodinamik": ["termo", "dinamik"],
    "hidroelektrik": ["hidro", "elektrik"],
    "biyoinformatik": ["biyo", "informatik"],
    "nanoteknoloji": ["nano", "teknoloji"],
    "futbolcu": ["futbol", "cu"],
    "basketbolcu": ["basketbol", "cu"],
    "voleybolcu": ["voleybol", "cu"],
}


def _decompose_zemberek(word: str, morphology) -> list[str] | None:
    try:
        import jpype  # noqa: PLC0415

        wa = morphology.analyze(jpype.JString(word))
        for sa in wa.getAnalysisResults():
            morphemes = [str(m) for m in sa.getMorphemes()]
            roots = [m for m in morphemes if "Noun" in m or "Verb" in m or "Adj" in m]
            if len(roots) > 1:
                return roots
    except Exception:  # noqa: BLE001
        pass
    return None


def add_compound_info(tokens: list[dict], morphology=None) -> list[dict]:
    """Annotate ROOT tokens that are compound words with ``_compound`` and ``_parts``."""
    result: list[dict] = []
    for tok in tokens:
        if tok["type"] != "ROOT" or tok["token"].strip().startswith("<"):
            result.append(tok)
            continue

        surface = tok["token"].strip().lower()

        if morphology is not None:
            parts = _decompose_zemberek(surface, morphology)
            if parts and len(parts) > 1:
                result.append({**tok, "_compound": True, "_parts": parts, "_source": "zemberek"})
                continue

        if surface in KNOWN_COMPOUNDS:
            result.append({**tok, "_compound": True, "_parts": KNOWN_COMPOUNDS[surface], "_source": "manual"})
        else:
            result.append(tok)

    return result
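A small sketch (illustrative, not part of the commit) of the manual-table fallback path, passing `morphology=None` so no JVM is needed:

```python
from turk_tokenizer._compound import add_compound_info

tokens = [{"token": " başbakan", "type": "ROOT"}]
out = add_compound_info(tokens, morphology=None)  # falls back to KNOWN_COMPOUNDS
print(out[0]["_parts"], out[0]["_source"])        # -> ['baş', 'bakan'] manual
```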
turk_tokenizer/_context_aware.py
ADDED
@@ -0,0 +1,60 @@
"""Fix 12: Context-aware Zemberek disambiguation."""

from __future__ import annotations

from ._root_validator import ZEMBEREK_AVAILABLE, _morphology, _jstr

AMBIGUOUS_WORDS = {
    "yüz", "gelir", "yazar", "geçer", "çıkar", "gider",
    "biter", "düşer", "tutar", "kalır", "gerekir", "uyar",
    "uçar", "güzel", "büyük", "küçük", "yeni", "eski",
}


def annotate_with_context(tokens: list[dict], original_text: str) -> list[dict]:
    """Enrich ROOT tokens with POS and lemma using Zemberek sentence-level disambiguation."""
    if not ZEMBEREK_AVAILABLE:
        return tokens

    try:
        sa_result = _morphology.analyzeAndDisambiguate(_jstr(original_text.strip()))
        best_list = sa_result.bestAnalysis()

        analyses: dict[str, dict] = {}
        for idx in range(best_list.size()):
            try:
                sa = best_list.get(idx)
                item = sa.getDictionaryItem()
                sf = str(sa.surfaceForm()).lower().strip()
                if sf not in analyses:
                    analyses[sf] = {
                        "lemma": str(item.lemma),
                        "pos": str(sa.getPos().shortForm),
                        "morphemes": [str(m) for m in sa.getMorphemes()],
                    }
            except Exception:  # noqa: BLE001
                continue

        result: list[dict] = []
        for tok in tokens:
            if tok["type"] != "ROOT" or tok["token"].strip().startswith("<"):
                result.append(tok)
                continue

            surface = tok["token"].strip().lower()
            z = analyses.get(surface)
            if z:
                result.append({
                    **tok,
                    "_pos": z["pos"],
                    "_lemma": z["lemma"],
                    "_morphemes": z["morphemes"],
                    "_disambiguated": surface in AMBIGUOUS_WORDS,
                })
            else:
                result.append(tok)

        return result

    except Exception:  # noqa: BLE001
        return tokens
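Because `annotate_with_context` quietly returns its input when Zemberek/JVM is unavailable, a sketch like the following is safe to run either way (hypothetical example, internal `type` key assumed):

```python
from turk_tokenizer._context_aware import annotate_with_context

tokens = [{"token": " yüz", "type": "ROOT"}]
out = annotate_with_context(tokens, "Denizde yüz metre yüzdü.")
# With Zemberek available, _pos/_lemma are filled and _disambiguated is True
# because "yüz" is in AMBIGUOUS_WORDS; otherwise the token is unchanged.
print(out[0].get("_pos"), out[0].get("_disambiguated"))
```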
turk_tokenizer/_java_check.py
ADDED
@@ -0,0 +1,57 @@
"""Java/JVM presence check with actionable error messages."""

from __future__ import annotations

import shutil
import subprocess
import sys


def ensure_java() -> None:
    """Raise a clear RuntimeError if Java is not installed."""
    if shutil.which("java") is not None:
        return

    # Try jpype's own detection as a fallback
    try:
        import jpype  # noqa: PLC0415
        jpype.getDefaultJVMPath()
        return
    except Exception:  # noqa: BLE001
        pass

    _install_cmd = _get_install_cmd()
    raise RuntimeError(
        "\n"
        "╔══════════════════════════════════════════════════════════════╗\n"
        "║ TurkTokenizer requires Java (JVM) — not found on this system ║\n"
        "╠══════════════════════════════════════════════════════════════╣\n"
        f"║ Install Java with: ║\n"
        f"║ {_install_cmd:<58}║\n"
        "║ ║\n"
        "║ Then re-run your script. ║\n"
        "╚══════════════════════════════════════════════════════════════╝\n"
    )


def _get_install_cmd() -> str:
    if sys.platform == "linux":
        # Try to detect distro
        try:
            out = subprocess.check_output(
                ["cat", "/etc/os-release"], text=True, stderr=subprocess.DEVNULL
            )
            if "ubuntu" in out.lower() or "debian" in out.lower():
                return "sudo apt install default-jre"
            if "fedora" in out.lower() or "rhel" in out.lower() or "centos" in out.lower():
                return "sudo dnf install java-latest-openjdk"
            if "arch" in out.lower():
                return "sudo pacman -S jre-openjdk"
        except Exception:  # noqa: BLE001
            pass
        return "sudo apt install default-jre"
    if sys.platform == "darwin":
        return "brew install openjdk"
    if sys.platform == "win32":
        return "winget install Microsoft.OpenJDK.21"
    return "Install Java from https://adoptium.net"
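A quick sketch (not part of the commit) of calling the check from user code; on machines without Java the boxed message carries the per-OS install command shown above:

```python
from turk_tokenizer._java_check import ensure_java

try:
    ensure_java()
    print("Java found; Zemberek-backed fixes are available.")
except RuntimeError as err:
    print(err)  # boxed error with the suggested install command
```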
turk_tokenizer/_medical_vocab.py
ADDED
@@ -0,0 +1,139 @@
"""Domain vocabulary: medical, sports, tourism roots (Fix 6)."""

from __future__ import annotations

MEDICAL_ROOTS: dict[str, str] = {
    "glomerül": "glomerül", "glomerülonefrit": "glomerülonefrit",
    "nefron": "nefron", "nefropati": "nefropati",
    "hepatosit": "hepatosit", "hepatit": "hepatit",
    "eritrosit": "eritrosit", "lökosit": "lökosit",
    "trombosit": "trombosit", "nötrofil": "nötrofil",
    "eozinofil": "eozinofil", "bazofil": "bazofil",
    "lenfosit": "lenfosit", "monosit": "monosit",
    "makrofaj": "makrofaj", "antijen": "antijen",
    "antikor": "antikor", "sitokin": "sitokin",
    "reseptör": "reseptör", "ligand": "ligand",
    "enzim": "enzim", "substrat": "substrat",
    "inhibitör": "inhibitör", "agonist": "agonist",
    "antagonist": "antagonist", "nöron": "nöron",
    "sinaps": "sinaps", "akson": "akson",
    "dendrit": "dendrit", "miyelin": "miyelin",
    "nekroz": "nekroz", "apoptoz": "apoptoz",
    "fibrozis": "fibrozis", "skleroz": "skleroz",
    "stenoz": "stenoz", "embolizm": "embolizm",
    "tromboz": "tromboz", "iskemi": "iskemi",
    "hipoksi": "hipoksi", "asidoz": "asidoz",
    "alkaloz": "alkaloz", "sepsis": "sepsis",
    "edema": "edema", "enflamasyon": "enflamasyon",
    "granülom": "granülom", "metaplazi": "metaplazi",
    "displazi": "displazi", "neoplazi": "neoplazi",
    "karsinoma": "karsinoma", "sarkom": "sarkom",
    "lenfoma": "lenfoma", "lösemi": "lösemi",
    "melanom": "melanom", "adenom": "adenom",
    "polip": "polip", "kist": "kist",
    "abse": "abse", "fistül": "fistül",
    "perforasyon": "perforasyon", "obstrüksiyon": "obstrüksiyon",
    "invajinasyon": "invajinasyon",
    "intususepsiyon": "intususepsiyon",
    "atelektazi": "atelektazi", "pnömotoraks": "pnömotoraks",
    "hidrotoraks": "hidrotoraks", "plevral": "plevral",
    "bakteri": "bakteri", "virüs": "virüs",
    "parazit": "parazit", "mantar": "mantar",
    "protozoa": "protozoa", "helmint": "helmint",
    "endotoksin": "endotoksin", "ekzotoksin": "ekzotoksin",
    "antibiyotik": "antibiyotik", "antiviral": "antiviral",
    "antifungal": "antifungal", "streptokokus": "streptokokus",
    "stafilokokus": "stafilokokus", "escherichia": "escherichia",
    "klebsiella": "klebsiella", "pseudomonas": "pseudomonas",
    "salmonella": "salmonella", "shigella": "shigella",
    "mycobacterium": "mycobacterium",
    "helicobacter": "helicobacter",
    "candida": "candida", "aspergillus": "aspergillus",
    "plasmodium": "plasmodium", "toxoplasma": "toxoplasma",
    "influenza": "influenza", "rotavirus": "rotavirus",
    "adenovirus": "adenovirus", "coronavirus": "coronavirus",
    "farmakokinetik": "farmakokinetik",
    "farmakodinami": "farmakodinami",
    "biyoyararlanım": "biyoyararlanım",
    "metabolit": "metabolit", "toksisite": "toksisite",
    "plazma": "plazma", "serum": "serum",
    "doz": "doz",
    "morfin": "morfin", "kodein": "kodein",
    "aspirin": "aspirin", "paracetamol": "paracetamol",
    "ibuprofen": "ibuprofen", "warfarin": "warfarin",
    "heparin": "heparin", "insülin": "insülin",
    "kortizol": "kortizol", "kortikosteroid": "kortikosteroid",
    "betabloker": "betabloker", "diüretik": "diüretik",
    "statin": "statin", "metformin": "metformin",
    "semptom": "semptom", "bulgu": "bulgu",
    "tanı": "tanı", "tedavi": "tedavi",
    "prognoz": "prognoz", "komplikasyon": "komplikasyon",
    "kontrendikasyon": "kontrendikasyon",
    "endikasyon": "endikasyon", "biyopsi": "biyopsi",
    "aspirasyon": "aspirasyon", "transplantasyon": "transplantasyon",
    "transplant": "transplant", "diyaliz": "diyaliz",
    "kemoterapi": "kemoterapi", "radyoterapi": "radyoterapi",
    "immunoterapi": "immunoterapi",
    "laparoskopi": "laparoskopi", "endoskopi": "endoskopi",
    "kolonoskopi": "kolonoskopi", "bronkoskopi": "bronkoskopi",
    "kateter": "kateter", "stent": "stent",
    "bypass": "bypass", "anastomoz": "anastomoz",
    "kardiyak": "kardiyak", "pulmoner": "pulmoner",
    "hepatik": "hepatik", "renal": "renal",
    "serebral": "serebral", "vasküler": "vasküler",
    "endokrin": "endokrin", "immün": "immün",
    "konjenital": "konjenital", "herediter": "herediter",
    "otoimmün": "otoimmün", "idiyopatik": "idiyopatik",
    "akut": "akut", "kronik": "kronik",
    "primer": "primer", "sekonder": "sekonder",
    "malign": "malign", "benign": "benign",
    "solid": "solid", "kistik": "kistik",
    "bilateral": "bilateral", "unilateral": "unilateral",
    "sistemik": "sistemik", "lokal": "lokal",
    "diffüz": "diffüz", "fokal": "fokal",
    "infeksiyon": "infeksiyon", "enfeksiyon": "enfeksiyon",
    "subakut": "subakut", "subklinik": "subklinik",
    "progesteron": "progesteron", "prolaktin": "prolaktin",
    "prostaglandin": "prostaglandin",
    "displazi": "displazi", "disfaji": "disfaji",
    "disfonksiyon": "disfonksiyon",
    "hemoglobin": "hemoglobin", "hematokrit": "hematokrit",
    "kreatinin": "kreatinin", "üre": "üre",
    "glukoz": "glukoz", "kolesterol": "kolesterol",
    "trigliserit": "trigliserit", "albumin": "albumin",
    "bilirubin": "bilirubin", "transaminaz": "transaminaz",
    "amilaz": "amilaz", "lipaz": "lipaz",
    "troponin": "troponin", "kreatinkinaz": "kreatinkinaz",
    "prokalsitonin": "prokalsitonin",
}

SPORTS_ROOTS: dict[str, str] = {
    "lig": "lig", "kulüp": "kulüp",
    "şampiyon": "şampiyon", "turnuva": "turnuva",
    "kupa": "kupa", "finalist": "finalist",
    "semifinal": "semifinal", "stadyum": "stadyum",
    "transfer": "transfer", "bonservis": "bonservis",
    "futbolcu": "futbolcu", "kaleci": "kaleci",
    "forvet": "forvet", "defans": "defans",
    "offside": "offside", "penaltı": "penaltı",
    "frikik": "frikik", "korner": "korner",
}

TOURISM_ROOTS: dict[str, str] = {
    "otel": "otel", "hostel": "hostel",
    "resort": "resort", "transfer": "transfer",
    "rezervasyon": "rezervasyon",
    "bagaj": "bagaj", "terminal": "terminal",
    "destinasyon": "destinasyon",
    "tur": "tur", "turist": "turist",
    "turizm": "turizm", "rehber": "rehber",
    "konaklama": "konaklama", "kapasite": "kapasite",
    "sezon": "sezon", "charter": "charter",
    "paket": "paket", "voucher": "voucher",
    "menü": "menü", "restoran": "restoran",
    "spa": "spa", "havuz": "havuz",
    "suit": "suit", "standart": "standart",
    "delüks": "delüks",
}

ALL_DOMAIN_ROOTS: dict[str, str] = {**MEDICAL_ROOTS, **SPORTS_ROOTS, **TOURISM_ROOTS}
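The module is pure data; a trivial membership-check sketch (not part of the commit):

```python
from turk_tokenizer._medical_vocab import ALL_DOMAIN_ROOTS

for word in ("glomerülonefrit", "penaltı", "rezervasyon", "masa"):
    print(word, word in ALL_DOMAIN_ROOTS)
# "masa" is not a domain root; the other three are.
```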
turk_tokenizer/_normalizer.py
ADDED
@@ -0,0 +1,128 @@
"""Fix 8: Special token normalization (NUM, DATE, URL, MENTION, HASHTAG, EMOJI)."""

from __future__ import annotations

import re

MONTH_NAMES = {
    "ocak", "şubat", "mart", "nisan", "mayıs", "haziran",
    "temmuz", "ağustos", "eylül", "ekim", "kasım", "aralık",
    "january", "february", "march", "april", "may", "june",
    "july", "august", "september", "october", "november", "december",
}

UNITS = {
    "km", "m", "cm", "mm", "nm",
    "kg", "g", "mg", "ton",
    "sn", "dk", "sa", "ms",
    "tl", "usd", "eur", "gbp",
    "kb", "mb", "gb", "tb", "pb",
    "ml", "mcg", "meq", "iu", "mmhg", "mosm",
    "hz", "mhz", "ghz", "watt", "kw", "mw", "kcal", "cal",
}

ROMAN_NUMERALS = {
    "i", "ii", "iii", "iv", "vi", "vii", "viii", "ix",
    "xi", "xii", "xiii", "xiv", "xv", "xvi", "xvii", "xviii", "xix", "xx",
}

URL_RE = re.compile(r'https?://\S+|www\.\S+', re.IGNORECASE)
MENTION_RE = re.compile(r'@[\w\u00C0-\u024F]+')
HASHTAG_RE = re.compile(r'#[\w\u00C0-\u024F]+')
NUMBER_RE = re.compile(
    r'%\d+[\.,]?\d*'
    r'|\d+[\.,]\d+'
    r'|\d{1,3}(?:\.\d{3})+'
    r'|\d+%'
    r'|\d+/\d+'
)
DATE_RE = re.compile(
    r'\d{1,2}[./\-]\d{1,2}[./\-]\d{2,4}'
    r'|\d{4}[./\-]\d{1,2}[./\-]\d{1,2}'
)
CURRENCY_RE = re.compile(r'[$€£¥₺₽]\d+[\.,]?\d*|\d+[\.,]?\d*[$€£¥₺₽]')
TEXT_EMOJI_RE = re.compile(r'[:;=]-?[\)\(\]\[dDpPoO3]|<3')
UNICODE_EMOJI_RE = re.compile(
    "[\U0001F600-\U0001F64F\U0001F300-\U0001F5FF"
    "\U0001F680-\U0001F6FF\U0001F1E0-\U0001F1FF"
    "\U00002700-\U000027BF\U0001F900-\U0001F9FF"
    "\U00002600-\U000026FF]+",
    flags=re.UNICODE,
)


def preprocess_special_tokens(text: str) -> tuple[str, list[dict]]:
    """Replace special tokens with placeholders before base tokenization."""
    placeholders: list[dict] = []
    counter = [0]

    def _ph(token_type: str, original: str) -> str:
        ph = f"\x00{token_type}{counter[0]}\x00"
        placeholders.append({"placeholder": ph, "type": token_type, "original": original})
        counter[0] += 1
        return ph

    def _replace(pattern: re.Pattern, ttype: str, t: str) -> str:
        return pattern.sub(lambda m: _ph(ttype, m.group(0)), t)

    text = _replace(URL_RE, "URL", text)
    text = _replace(MENTION_RE, "MENTION", text)
    text = _replace(HASHTAG_RE, "HASHTAG", text)
    text = _replace(DATE_RE, "DATE", text)
    text = _replace(CURRENCY_RE, "UNIT", text)
    text = _replace(NUMBER_RE, "NUM", text)
    text = _replace(UNICODE_EMOJI_RE, "EMOJI", text)
    text = _replace(TEXT_EMOJI_RE, "EMOJI", text)
    return text, placeholders


def restore_special_tokens(tokens: list[dict], placeholders: list[dict]) -> list[dict]:
    """Restore placeholders in the token stream."""
    if not placeholders:
        return tokens

    ph_map = {p["placeholder"]: p for p in placeholders}
    restored: set[str] = set()
    result: list[dict] = []

    for tok in tokens:
        raw = tok["token"]
        matched = next(((ph, info) for ph, info in ph_map.items() if ph in raw), None)
        if matched:
            ph, info = matched
            if ph not in restored:
                restored.add(ph)
                ttype = info["type"]
                result.append({
                    "token": f" {info['original']}",
                    "type": ttype,
                    f"_{ttype.lower()}": True,
                })
        else:
            result.append(tok)

    return result


def reclassify_numbers_in_tokens(tokens: list[dict]) -> list[dict]:
    """Catch remaining number/unit tokens missed by pre-tokenization."""
    result: list[dict] = []
    for tok in tokens:
        if tok["type"] not in ("BPE", "ROOT"):
            result.append(tok)
            continue

        raw = tok["token"].strip()

        if NUMBER_RE.fullmatch(raw):
            result.append({**tok, "type": "NUM", "_num": True})
        elif raw.lower() in UNITS and tok["type"] == "BPE":
            result.append({**tok, "type": "UNIT", "_unit": True})
        elif raw.lower() in ROMAN_NUMERALS and tok["type"] == "BPE":
            result.append({**tok, "type": "NUM", "_roman": True})
        elif raw.lower() in MONTH_NAMES and tok["type"] == "BPE":
            result.append({**tok, "type": "ROOT", "_month": True})
        else:
            result.append(tok)

    return result
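A short sketch (not part of the commit) of the placeholder masking step on a sentence with several special spans; the placeholder ordering follows the replacement passes above:

```python
from turk_tokenizer._normalizer import preprocess_special_tokens

text = "Toplantı 12.05.2024 tarihinde, detaylar https://example.com adresinde @ayse #duyuru"
masked, placeholders = preprocess_special_tokens(text)
print(masked)                              # special spans replaced by \x00...\x00 placeholders
print([p["type"] for p in placeholders])   # e.g. ['URL', 'MENTION', 'HASHTAG', 'DATE']
```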
turk_tokenizer/_preprocessor.py
ADDED
@@ -0,0 +1,163 @@
"""Fix 1: ALL CAPS inflation fix. Fix 2: Apostrophe / code-switching split."""

from __future__ import annotations

import re

TR_CHARS = set("çğışöüÇĞİŞÖÜ")

KNOWN_TURKISH_BASES = {
    "istanbul", "ankara", "izmir", "türkiye", "anadolu", "boğaziçi",
    "cumhuriyet", "atatürk", "karadeniz", "marmara", "ege", "akdeniz",
    "temmuz", "ocak", "şubat", "mart", "nisan", "mayıs", "haziran",
    "ağustos", "eylül", "ekim", "kasım", "aralık",
}

KNOWN_FOREIGN_BASES = {
    "python", "zoom", "google", "github", "twitter", "youtube",
    "instagram", "linkedin", "facebook", "whatsapp", "telegram",
    "numpy", "pandas", "django", "flask", "react", "javascript",
    "typescript", "docker", "linux", "windows", "android", "iphone",
    "chatgpt", "openai", "claude", "gemini", "llama", "bert",
    "excel", "powerpoint", "outlook", "teams", "slack", "notion",
    "spotify", "netflix", "amazon", "alibaba", "huawei", "samsung",
}

TURKISH_SUFFIXES_AFTER_APOSTROPHE = sorted(
    [
        "nın", "nin", "nun", "nün", "dan", "den", "tan", "ten",
        "da", "de", "ta", "te", "ya", "ye", "nda", "nde",
        "yı", "yi", "yu", "yü", "nı", "ni", "nu", "nü",
        "lar", "ler", "lara", "lere", "ları", "leri",
        "ım", "im", "um", "üm", "ın", "in", "un", "ün",
        "mız", "miz", "muz", "müz", "nız", "niz", "nuz", "nüz",
        "dır", "dir", "dur", "dür", "tır", "tir", "tur", "tür",
        "ki", "li", "lı", "lu", "lü", "sız", "siz", "suz", "süz",
        "a", "e", "ı", "i", "u", "ü",
    ],
    key=len,
    reverse=True,
)

_APO_SEP = "\ue001"
_APO_RE = re.compile(
    r"([A-Za-zÇçĞğİıÖöŞşÜü0-9]{2,})['\u2019]([A-Za-zÇçĞğİıÖöŞşÜü]{1,6})\b"
)
_CAPS_RE = re.compile(r'\b([A-ZÇĞİÖŞÜ]{2,})\b')


def _is_turkish_base(word: str) -> bool:
    w = word.lower()
    if w in KNOWN_FOREIGN_BASES:
        return False
    if any(c in TR_CHARS for c in word):
        return True
    if w in KNOWN_TURKISH_BASES:
        return True
    if len(w) < 4:
        return True
    return False


# ── Fix 1: ALL CAPS ───────────────────────────────────────────────────────────

def _fix_all_caps(text: str) -> tuple[str, set]:
    caps: set[str] = set()

    def _replace(m: re.Match) -> str:
        w = m.group(1)
        caps.add(w.lower())
        return w.lower()

    return _CAPS_RE.sub(_replace, text), caps


def _restore_caps_tokens(tokens: list[dict], caps: set) -> list[dict]:
    result: list[dict] = []
    i = 0
    while i < len(tokens):
        tok = tokens[i]
        raw_low = tok["token"].strip().lower()

        if tok["type"] == "ROOT" and raw_low in caps:
            result.append({"token": "<uppercase_word>", "type": "ROOT", "_caps": True})
            result.append(tok)
            i += 1
            continue

        if tok["type"] == "BPE" and tok["token"].startswith(" "):
            combined = raw_low
            lookahead = [tok]
            j = i + 1
            while j < len(tokens):
                nt = tokens[j]
                if not nt["token"].startswith(" "):
                    combined += nt["token"].strip().lower()
                    lookahead.append(nt)
                    j += 1
                    if combined in caps:
                        break
                    if len(combined) > 8:
                        break
                else:
                    break
            if combined in caps:
                result.append({"token": "<uppercase_word>", "type": "ROOT", "_caps": True})
                result.append({"token": f" {combined}", "type": "ROOT",
                               "_acronym": True, "_caps": True})
                i = j
                continue

        result.append(tok)
        i += 1

    return result


# ── Fix 2: Apostrophe split ───────────────────────────────────────────────────

def _split_apostrophe(text: str) -> str:
    def _repl(m: re.Match) -> str:
        base, suffix = m.group(1), m.group(2)
        if _is_turkish_base(base):
            return m.group(0)
        if any(suffix.lower() == s for s in TURKISH_SUFFIXES_AFTER_APOSTROPHE):
            return f"{base} {_APO_SEP} {suffix}"
        return m.group(0)

    return _APO_RE.sub(_repl, text)


def _merge_apostrophe_tokens(tokens: list[dict]) -> list[dict]:
    result: list[dict] = []
    i = 0
    while i < len(tokens):
        tok = tokens[i]
        if _APO_SEP in tok["token"].strip():
            if result:
                result[-1]["type"] = "ROOT"
                result[-1]["_foreign"] = True
            i += 1
            if i < len(tokens):
                tokens[i]["type"] = "SUFFIX"
                tokens[i]["_apo_suffix"] = True
                result.append(tokens[i])
                i += 1
        else:
            result.append(tok)
            i += 1
    return result


# ── Combined pre / post ───────────────────────────────────────────────────────

def preprocess(text: str) -> tuple[str, set]:
    text, caps = _fix_all_caps(text)
    text = _split_apostrophe(text)
    return text, caps


def postprocess(tokens: list[dict], caps: set) -> list[dict]:
    tokens = _restore_caps_tokens(tokens, caps)
    tokens = _merge_apostrophe_tokens(tokens)
    return tokens
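A minimal sketch (not part of the commit) of the text-level `preprocess` step on input mixing an ALL CAPS word with a foreign base plus apostrophe suffix:

```python
from turk_tokenizer._preprocessor import preprocess

text = "ANKARA toplantısına Zoom'da katıldım"
lowered, caps = preprocess(text)
print(lowered)  # "ankara toplantısına Zoom <SEP> da katıldım" (apostrophe replaced by the private-use separator)
print(caps)     # {"ankara"}
```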
turk_tokenizer/_root_validator.py
ADDED
|
@@ -0,0 +1,206 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Zemberek-based root validation and correction (Fix 4)."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
import os
|
| 6 |
+
from pathlib import Path
|
| 7 |
+
|
| 8 |
+
# ── Zemberek JAR: bundled with package ───────────────────────────────────────
|
| 9 |
+
|
| 10 |
+
_DATA_DIR = Path(__file__).parent / "data"
|
| 11 |
+
JAR_PATH = _DATA_DIR / "zemberek-full.jar"
|
| 12 |
+
|
| 13 |
+
ZEMBEREK_AVAILABLE = False
|
| 14 |
+
_morphology = None
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
def _init_zemberek() -> None:
|
| 18 |
+
global ZEMBEREK_AVAILABLE, _morphology
|
| 19 |
+
|
| 20 |
+
if not JAR_PATH.exists():
|
| 21 |
+
print(
|
| 22 |
+
f"[TurkTokenizer] zemberek-full.jar not found at {JAR_PATH}\n"
|
| 23 |
+
" Root validation disabled — morphological fixes will be limited."
|
| 24 |
+
)
|
| 25 |
+
return
|
| 26 |
+
|
| 27 |
+
try:
|
| 28 |
+
import jpype # noqa: PLC0415
|
| 29 |
+
|
| 30 |
+
if not jpype.isJVMStarted():
|
| 31 |
+
jpype.startJVM(
|
| 32 |
+
jpype.getDefaultJVMPath(),
|
| 33 |
+
"-ea",
|
| 34 |
+
f"-Djava.class.path={JAR_PATH}",
|
| 35 |
+
convertStrings=False,
|
| 36 |
+
)
|
| 37 |
+
|
| 38 |
+
TurkishMorphology = jpype.JClass("zemberek.morphology.TurkishMorphology")
|
| 39 |
+
_morphology = TurkishMorphology.createWithDefaults()
|
| 40 |
+
ZEMBEREK_AVAILABLE = True
|
| 41 |
+
|
| 42 |
+
except ImportError:
|
| 43 |
+
print("[TurkTokenizer] jpype1 not installed → pip install jpype1")
|
| 44 |
+
except Exception as exc: # noqa: BLE001
|
| 45 |
+
print(f"[TurkTokenizer] Zemberek init failed: {exc}")
|
| 46 |
+
|
| 47 |
+
|
| 48 |
+
_init_zemberek()
|
| 49 |
+
|
| 50 |
+
|
| 51 |
+
# ── Zemberek API helpers ──────────────────────────────────────────────────────
|
| 52 |
+
|
| 53 |
+
def _jstr(s: str):
|
| 54 |
+
import jpype # noqa: PLC0415
|
| 55 |
+
return jpype.JString(s)
|
| 56 |
+
|
| 57 |
+
|
| 58 |
+
def analyze_word(word: str) -> list[dict]:
|
| 59 |
+
"""Return all Zemberek analyses for a single word."""
|
| 60 |
+
if not ZEMBEREK_AVAILABLE:
|
| 61 |
+
return []
|
| 62 |
+
try:
|
| 63 |
+
wa = _morphology.analyze(_jstr(word))
|
| 64 |
+
return [
|
| 65 |
+
{
|
| 66 |
+
"lemma": str(sa.getDictionaryItem().lemma),
|
| 67 |
+
"pos": str(sa.getPos().shortForm),
|
| 68 |
+
"morphemes":[str(m) for m in sa.getMorphemes()],
|
| 69 |
+
"surface": str(sa.surfaceForm()),
|
| 70 |
+
}
|
| 71 |
+
for sa in wa.getAnalysisResults()
|
| 72 |
+
]
|
| 73 |
+
except Exception: # noqa: BLE001
|
| 74 |
+
return []
|
| 75 |
+
|
| 76 |
+
|
| 77 |
+
def get_root_and_suffixes(word: str) -> dict | None:
|
| 78 |
+
"""Return root + suffix list for a word, or None if unknown."""
|
| 79 |
+
analyses = analyze_word(word)
|
| 80 |
+
if not analyses:
|
| 81 |
+
return None
|
| 82 |
+
a = analyses[0]
|
| 83 |
+
return {"root": a["lemma"], "suffixes": a["morphemes"][1:], "pos": a["pos"]}
|
| 84 |
+
|
| 85 |
+
|
| 86 |
+
# ── Heuristic fallback (no Zemberek) ─────────────────────────────────────────
|
| 87 |
+
|
| 88 |
+
_SPURIOUS_SHORT_ROOTS = {"oğ", "gök", "zo", "me", "im", "pro", "go", "da", "al"}
|
| 89 |
+
|
| 90 |
+
|
| 91 |
+
def _is_spurious_root(root: str, next_tokens: list[dict]) -> bool:
|
| 92 |
+
if root.strip().lower() not in _SPURIOUS_SHORT_ROOTS:
|
| 93 |
+
return False
|
| 94 |
+
return sum(1 for t in next_tokens[:3] if t["type"] == "BPE") >= 2
|
| 95 |
+
|
| 96 |
+
|
| 97 |
+
# ── Main validation ───────────────────────────────────────────────────────────
|
| 98 |
+
|
| 99 |
+
def build_correction_map(
|
| 100 |
+
original_words: list[str], base_tokenizer
|
| 101 |
+
) -> dict[str, str]:
|
| 102 |
+
"""Build a {tokenizer_root → zemberek_root} correction map."""
|
| 103 |
+
correction_map: dict[str, str] = {}
|
| 104 |
+
|
| 105 |
+
for word in original_words:
|
| 106 |
+
w = word.lower().strip("'\".,!?;:()")
|
| 107 |
+
if not w or len(w) < 3:
|
| 108 |
+
continue
|
| 109 |
+
|
| 110 |
+
z = get_root_and_suffixes(w)
|
| 111 |
+
if not z or z["root"] == "UNK":
|
| 112 |
+
continue
|
| 113 |
+
z_root = z["root"].lower()
|
| 114 |
+
|
| 115 |
+
try:
|
| 116 |
+
toks = base_tokenizer.tokenize_text(w)
|
| 117 |
+
t_root = next(
|
| 118 |
+
(t["token"].strip().lower() for t in toks if t["type"] == "ROOT"),
|
| 119 |
+
None,
|
| 120 |
+
)
|
| 121 |
+
except Exception: # noqa: BLE001
|
| 122 |
+
continue
|
| 123 |
+
|
| 124 |
+
if not t_root or t_root == z_root:
|
| 125 |
+
continue
|
| 126 |
+
|
| 127 |
+
diff = len(z_root) - len(t_root)
|
| 128 |
+
if diff < 0 or diff > 4:
|
| 129 |
+
continue
|
| 130 |
+
if not z_root.startswith(t_root):
|
| 131 |
+
continue
|
| 132 |
+
|
| 133 |
+
correction_map[t_root] = z_root
|
| 134 |
+
|
| 135 |
+
return correction_map
|
| 136 |
+
|
| 137 |
+
|
| 138 |
+
def validate_roots(
|
| 139 |
+
tokens: list[dict],
|
| 140 |
+
original_words: list[str],
|
| 141 |
+
base_tokenizer=None,
|
| 142 |
+
) -> list[dict]:
|
| 143 |
+
"""Apply Zemberek root corrections to the token stream."""
|
| 144 |
+
if not ZEMBEREK_AVAILABLE:
|
| 145 |
+
result = []
|
| 146 |
+
for i, tok in enumerate(tokens):
|
| 147 |
+
if tok["type"] == "ROOT" and not tok["token"].strip().startswith("<"):
|
| 148 |
+
if _is_spurious_root(tok["token"], tokens[i + 1 : i + 5]):
|
| 149 |
+
tok = {**tok, "_suspicious": True}
|
| 150 |
+
result.append(tok)
|
| 151 |
+
return result
|
| 152 |
+
|
| 153 |
+
corr = (
|
| 154 |
+
build_correction_map(original_words, base_tokenizer)
|
| 155 |
+
if base_tokenizer is not None
|
| 156 |
+
else {}
|
| 157 |
+
)
|
| 158 |
+
|
| 159 |
+
result = []
|
| 160 |
+
for tok in tokens:
|
| 161 |
+
if tok["type"] != "ROOT" or tok["token"].strip().startswith("<"):
|
| 162 |
+
result.append(tok)
|
| 163 |
+
continue
|
| 164 |
+
|
| 165 |
+
surface = tok["token"].strip().lower()
|
| 166 |
+
correct = corr.get(surface)
|
| 167 |
+
|
| 168 |
+
if correct and correct != surface:
|
| 169 |
+
leading = " " if tok["token"].startswith(" ") else ""
|
| 170 |
+
tok = {
|
| 171 |
+
**tok,
|
| 172 |
+
"token": leading + correct,
|
| 173 |
+
"_original_token": tok["token"],
|
| 174 |
+
"_root_corrected": True,
|
| 175 |
+
"_note": f"root corrected: '{surface}' → '{correct}'",
|
| 176 |
+
}
|
| 177 |
+
|
| 178 |
+
result.append(tok)
|
| 179 |
+
|
| 180 |
+
return result
|
| 181 |
+
|
| 182 |
+
|
| 183 |
+
def disambiguate_sentence(words: list[str]) -> list[dict | None]:
|
| 184 |
+
"""Sentence-level Zemberek disambiguation."""
|
| 185 |
+
if not ZEMBEREK_AVAILABLE:
|
| 186 |
+
return [None] * len(words)
|
| 187 |
+
try:
|
| 188 |
+
sa_result = _morphology.analyzeAndDisambiguate(_jstr(" ".join(words)))
|
| 189 |
+
best = sa_result.bestAnalysis()
|
| 190 |
+
out = []
|
| 191 |
+
for i in range(best.size()):
|
| 192 |
+
try:
|
| 193 |
+
sa = best.get(i)
|
| 194 |
+
item = sa.getDictionaryItem()
|
| 195 |
+
out.append({
|
| 196 |
+
"lemma": str(item.lemma),
|
| 197 |
+
"pos": str(sa.getPos().shortForm),
|
| 198 |
+
"morphemes": [str(m) for m in sa.getMorphemes()],
|
| 199 |
+
})
|
| 200 |
+
except Exception: # noqa: BLE001
|
| 201 |
+
out.append(None)
|
| 202 |
+
while len(out) < len(words):
|
| 203 |
+
out.append(None)
|
| 204 |
+
return out[: len(words)]
|
| 205 |
+
except Exception: # noqa: BLE001
|
| 206 |
+
return [analyze_word(w)[0] if analyze_word(w) else None for w in words]
|
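The root-validation module above exposes `validate_roots` and `disambiguate_sentence` as plain functions over token dicts. A minimal usage sketch for the sentence-level disambiguator (not part of the diff; it assumes Java and the bundled Zemberek jar are available, otherwise the fallback path above yields `None` entries):

```python
# Illustrative only — requires a working Zemberek setup; the words are example inputs.
from turk_tokenizer._root_validator import disambiguate_sentence

words = ["evde", "kaldım"]
for word, ana in zip(words, disambiguate_sentence(words)):
    if ana is None:
        print(word, "<no analysis>")
    else:
        print(word, ana["lemma"], ana["pos"], ana["morphemes"])
```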
turk_tokenizer/_suffix_expander.py
ADDED
|
@@ -0,0 +1,212 @@
|
| 1 |
+
"""Fix 3: BPE → SUFFIX reclassification. Fix 5: Punctuation → PUNCT."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
PUNCT_CHARS = set(
|
| 6 |
+
'?.,;:!-\u2013\u2014()[]{}"`/\\|@#$%^&*+=<>~'
|
| 7 |
+
'\u2019\u2018\u201c\u201d\u2032\u00ab\u00bb\u2039\u203a'
|
| 8 |
+
'\u2022\u2026\u00b7\u00b0\u00b1\u00d7\u00f7'
|
| 9 |
+
)
|
| 10 |
+
_PUNCT_DIGITS = set("0123456789")
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
def _is_punct(token: str) -> bool:
|
| 14 |
+
s = token.strip()
|
| 15 |
+
if not s:
|
| 16 |
+
return False
|
| 17 |
+
return all(
|
| 18 |
+
c in PUNCT_CHARS or c in _PUNCT_DIGITS or (ord(c) > 0x02FF and not c.isalpha())
|
| 19 |
+
for c in s
|
| 20 |
+
)
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
# ── Suffix dictionary (260+ entries) ─────────────────────────────────────────
|
| 24 |
+
|
| 25 |
+
EXTENDED_SUFFIX_MAP: dict[str, str] = {
|
| 26 |
+
# Plural + case
|
| 27 |
+
"leri": "-PL+ACC", "ları": "-PL+ACC",
|
| 28 |
+
"lere": "-PL+DAT", "lara": "-PL+DAT",
|
| 29 |
+
"lerin": "-PL+GEN", "ların": "-PL+GEN",
|
| 30 |
+
"lerde": "-PL+LOC", "larda": "-PL+LOC",
|
| 31 |
+
"lerden": "-PL+ABL","lardan": "-PL+ABL",
|
| 32 |
+
"lerle": "-PL+INS", "larla": "-PL+INS",
|
| 33 |
+
"lerce": "-PL+EQU", "larca": "-PL+EQU",
|
| 34 |
+
# -yon / loanword suffixes
|
| 35 |
+
"yon": "-YON", "iyon": "-YON", "asyon": "-YON", "izasyon": "-YON",
|
| 36 |
+
# Adjective derivation
|
| 37 |
+
"al": "-ADJ", "el": "-ADJ", "ik": "-ADJ",
|
| 38 |
+
"sal": "-ADJ.TR", "sel": "-ADJ.TR",
|
| 39 |
+
# 1st/2nd plural possessive
|
| 40 |
+
"imiz": "-P1PL", "ımız": "-P1PL", "umuz": "-P1PL", "ümüz": "-P1PL",
|
| 41 |
+
"iniz": "-P2PL", "ınız": "-P2PL", "unuz": "-P2PL", "ünüz": "-P2PL",
|
| 42 |
+
# Arabic long vowels
|
| 43 |
+
"\u00e2": "-LONG_A", "\u00ee": "-LONG_I", "\u00fb": "-LONG_U",
|
| 44 |
+
# Roman numerals
|
| 45 |
+
"ii": "-ROM", "iii": "-ROM", "iv": "-ROM", "vi": "-ROM",
|
| 46 |
+
"vii": "-ROM", "viii": "-ROM", "ix": "-ROM", "xi": "-ROM",
|
| 47 |
+
"xii": "-ROM", "xiii": "-ROM", "xiv": "-ROM", "xv": "-ROM",
|
| 48 |
+
# Frequent BPE pieces
|
| 49 |
+
"eri": "-PL.SFX", "una": "-P3+DAT", "iril": "-PASS.SFX",
|
| 50 |
+
"yan": "-PART.ACT","ren": "-PART.ACT", "ıda": "-LOC.SFX",
|
| 51 |
+
"maya": "-NEG.INF", "üler": "-PL.SFX", "ıler": "-PL.SFX",
|
| 52 |
+
"ni": "-ACC.SFX", "ri": "-PL.SFX", "lan": "-PASS+NZ",
|
| 53 |
+
"on": "-YON.SFX",
|
| 54 |
+
# Possessive + case compounds
|
| 55 |
+
"ımı": "-P1+ACC", "imi": "-P1+ACC", "umu": "-P1+ACC", "ümü": "-P1+ACC",
|
| 56 |
+
"ıyla": "-INS.COMP","iyle": "-INS.COMP","uyla": "-INS.COMP","üyle": "-INS.COMP",
|
| 57 |
+
"kten": "-ABL.COMP","ğından": "-ABL.COMP","ğinden": "-ABL.COMP",
|
| 58 |
+
"yla": "-COM", "yle": "-COM",
|
| 59 |
+
# Abstract noun + possessive
|
| 60 |
+
"liği": "-ABSTR+P3", "lığı": "-ABSTR+P3",
|
| 61 |
+
"luğu": "-ABSTR+P3", "lüğü": "-ABSTR+P3",
|
| 62 |
+
"liğini": "-ABSTR+P3+ACC", "lığını": "-ABSTR+P3+ACC",
|
| 63 |
+
# -izm (ideology)
|
| 64 |
+
"izm": "-ISM", "izmi": "-ISM+P3", "izmde": "-ISM+LOC",
|
| 65 |
+
"izmden": "-ISM+ABL", "izmin": "-ISM+GEN",
|
| 66 |
+
# Aorist
|
| 67 |
+
"lir": "-AOR3SG", "lır": "-AOR3SG", "lur": "-AOR3SG", "lür": "-AOR3SG",
|
| 68 |
+
# 3sg possessive + case
|
| 69 |
+
"ine": "-P3+DAT", "ına": "-P3+DAT", "une": "-P3+DAT", "üne": "-P3+DAT",
|
| 70 |
+
"inde": "-P3+LOC", "ında": "-P3+LOC", "unda": "-P3+LOC", "ünde": "-P3+LOC",
|
| 71 |
+
"ini": "-P3+ACC", "ını": "-P3+ACC", "unu": "-P3+ACC", "ünü": "-P3+ACC",
|
| 72 |
+
"inden": "-P3+ABL","ından": "-P3+ABL","undan": "-P3+ABL","ünden": "-P3+ABL",
|
| 73 |
+
# -daki
|
| 74 |
+
"daki": "-LOC+REL","deki": "-LOC+REL","taki": "-LOC+REL","teki": "-LOC+REL",
|
| 75 |
+
# Passive + nominalization
|
| 76 |
+
"lan": "-PASS+NZ", "len": "-PASS+NZ",
|
| 77 |
+
# Verbal noun
|
| 78 |
+
"mesi": "-VN3", "ması": "-VN3",
|
| 79 |
+
"mesini": "-VN3+ACC", "masını": "-VN3+ACC",
|
| 80 |
+
"mesine": "-VN3+DAT", "masına": "-VN3+DAT",
|
| 81 |
+
"mesinde": "-VN3+LOC", "masında": "-VN3+LOC",
|
| 82 |
+
# Genitive + possessive
|
| 83 |
+
"ının": "-GEN+P", "inin": "-GEN+P", "unun": "-GEN+P", "ünün": "-GEN+P",
|
| 84 |
+
# Participle
|
| 85 |
+
"diği": "-PART", "dığı": "-PART", "tiği": "-PART", "tığı": "-PART",
|
| 86 |
+
"duğu": "-PART", "düğü": "-PART", "tuğu": "-PART", "tüğü": "-PART",
|
| 87 |
+
"ği": "-PART.SFX","ğı": "-PART.SFX","gu": "-PART.SFX","gü": "-PART.SFX",
|
| 88 |
+
# Negative verbal noun
|
| 89 |
+
"mas": "-NEG.VN", "mes": "-NEG.VN",
|
| 90 |
+
# 2sg imperative
|
| 91 |
+
"sin": "-IMP2", "sın": "-IMP2", "sun": "-IMP2", "sün": "-IMP2",
|
| 92 |
+
# Passive short
|
| 93 |
+
"ıl": "-PASS", "il": "-PASS", "ul": "-PASS", "ül": "-PASS",
|
| 94 |
+
# Causative + VN
|
| 95 |
+
"irme": "-CAUS+VN","ırma": "-CAUS+VN","urma": "-CAUS+VN",
|
| 96 |
+
"ürme": "-CAUS+VN","erme": "-CAUS+VN","arma": "-CAUS+VN",
|
| 97 |
+
# Accusative
|
| 98 |
+
"ı": "-ACC", "i": "-ACC", "u": "-ACC", "ü": "-ACC",
|
| 99 |
+
# Past tense
|
| 100 |
+
"dım": "-DI1SG","dim": "-DI1SG","dum": "-DI1SG","düm": "-DI1SG",
|
| 101 |
+
"tım": "-DI1SG","tim": "-DI1SG","tum": "-DI1SG","tüm": "-DI1SG",
|
| 102 |
+
"dık": "-DI1PL","dik": "-DI1PL","duk": "-DI1PL","dük": "-DI1PL",
|
| 103 |
+
"tık": "-DI1PL","tik": "-DI1PL","tuk": "-DI1PL","tük": "-DI1PL",
|
| 104 |
+
"dın": "-DI2SG","din": "-DI2SG","dun": "-DI2SG","dün": "-DI2SG",
|
| 105 |
+
"tın": "-DI2SG","tin": "-DI2SG","tun": "-DI2SG","tün": "-DI2SG",
|
| 106 |
+
"d": "-PAST", "t": "-PAST",
|
| 107 |
+
# Conditional
|
| 108 |
+
"sa": "-COND", "se": "-COND",
|
| 109 |
+
# Progressive
|
| 110 |
+
"yor": "-PROG",
|
| 111 |
+
# Simple past
|
| 112 |
+
"dı": "-PST", "di": "-PST", "du": "-PST", "dü": "-PST",
|
| 113 |
+
"tı": "-PST", "ti": "-PST", "tu": "-PST", "tü": "-PST",
|
| 114 |
+
# Aorist short
|
| 115 |
+
"ir": "-AOR", "ır": "-AOR", "ur": "-AOR", "ür": "-AOR",
|
| 116 |
+
"er": "-AOR", "ar": "-AOR",
|
| 117 |
+
# Evidential past
|
| 118 |
+
"mış": "-EVID","miş": "-EVID","muş": "-EVID","müş": "-EVID",
|
| 119 |
+
# Negation
|
| 120 |
+
"ma": "-NEG", "me": "-NEG",
|
| 121 |
+
"lama": "-VN+NEG","leme": "-VN+NEG",
|
| 122 |
+
# Abilitative
|
| 123 |
+
"bil": "-ABIL",
|
| 124 |
+
# Necessitative
|
| 125 |
+
"malı": "-NECES","meli": "-NECES",
|
| 126 |
+
# Infinitive
|
| 127 |
+
"mak": "-INF", "mek": "-INF",
|
| 128 |
+
# -ken (while/when)
|
| 129 |
+
"ken": "-WHEN",
|
| 130 |
+
# Converb
|
| 131 |
+
"arak": "-CONV","erek": "-CONV",
|
| 132 |
+
# With / without
|
| 133 |
+
"lı": "-WITH", "li": "-WITH", "lu": "-WITH", "lü": "-WITH",
|
| 134 |
+
# Agentive
|
| 135 |
+
"cı": "-AGT", "ci": "-AGT", "cu": "-AGT", "cü": "-AGT",
|
| 136 |
+
"çı": "-AGT", "çi": "-AGT", "çu": "-AGT", "çü": "-AGT",
|
| 137 |
+
# Abstract noun
|
| 138 |
+
"lık": "-ABSTR","lik": "-ABSTR","luk": "-ABSTR","lük": "-ABSTR",
|
| 139 |
+
"lığ": "-ABSTR","liğ": "-ABSTR",
|
| 140 |
+
# Optative 1pl
|
| 141 |
+
"elim": "-OPT1PL","alım": "-OPT1PL",
|
| 142 |
+
# Person suffixes
|
| 143 |
+
"ım": "-1SG", "im": "-1SG", "um": "-1SG", "üm": "-1SG",
|
| 144 |
+
"ın": "-2SG", "in": "-2SG", "un": "-2SG", "ün": "-2SG",
|
| 145 |
+
"iz": "-1PL", "ız": "-1PL", "uz": "-1PL", "üz": "-1PL",
|
| 146 |
+
"nız": "-2PL","niz": "-2PL","nuz": "-2PL","nüz": "-2PL",
|
| 147 |
+
# Question
|
| 148 |
+
"mı": "-Q", "mi": "-Q", "mu": "-Q", "mü": "-Q",
|
| 149 |
+
# Dative
|
| 150 |
+
"a": "-DAT", "e": "-DAT", "ya": "-DAT", "ye": "-DAT",
|
| 151 |
+
# Ablative
|
| 152 |
+
"dan": "-ABL","den": "-ABL","tan": "-ABL","ten": "-ABL",
|
| 153 |
+
# Locative
|
| 154 |
+
"da": "-LOC", "de": "-LOC", "ta": "-LOC", "te": "-LOC",
|
| 155 |
+
# Plural
|
| 156 |
+
"lar": "-PL", "ler": "-PL",
|
| 157 |
+
# 3sg possessive short
|
| 158 |
+
"sı": "-P3", "si": "-P3", "su": "-P3", "sü": "-P3",
|
| 159 |
+
# Genitive
|
| 160 |
+
"nin": "-GEN","nın": "-GEN","nun": "-GEN","nün": "-GEN",
|
| 161 |
+
# Instrumental
|
| 162 |
+
"le": "-INS", "la": "-INS",
|
| 163 |
+
# Equative
|
| 164 |
+
"ce": "-EQU","ca": "-EQU","çe": "-EQU","ça": "-EQU",
|
| 165 |
+
# Glide
|
| 166 |
+
"y": "-GLIDE",
|
| 167 |
+
}
|
| 168 |
+
|
| 169 |
+
_SUFFIX_MAP_SORTED = sorted(
|
| 170 |
+
EXTENDED_SUFFIX_MAP.items(), key=lambda x: len(x[0]), reverse=True
|
| 171 |
+
)
|
| 172 |
+
|
| 173 |
+
|
| 174 |
+
def reclassify_bpe_suffixes(tokens: list[dict]) -> list[dict]:
|
| 175 |
+
"""Reclassify BPE tokens: punctuation → PUNCT, word-internal suffixes → SUFFIX."""
|
| 176 |
+
result: list[dict] = []
|
| 177 |
+
for tok in tokens:
|
| 178 |
+
if tok["type"] != "BPE":
|
| 179 |
+
result.append(tok)
|
| 180 |
+
continue
|
| 181 |
+
|
| 182 |
+
raw = tok["token"]
|
| 183 |
+
stripped = raw.strip()
|
| 184 |
+
|
| 185 |
+
if _is_punct(raw):
|
| 186 |
+
result.append({**tok, "type": "PUNCT", "_punct": True})
|
| 187 |
+
continue
|
| 188 |
+
|
| 189 |
+
# Only reclassify tokens without a leading space (word-internal)
|
| 190 |
+
if raw != stripped:
|
| 191 |
+
result.append(tok)
|
| 192 |
+
continue
|
| 193 |
+
|
| 194 |
+
prev_ok = bool(result) and result[-1]["type"] in ("ROOT", "SUFFIX", "BPE")
|
| 195 |
+
if not prev_ok:
|
| 196 |
+
result.append(tok)
|
| 197 |
+
continue
|
| 198 |
+
|
| 199 |
+
sl = stripped.lower()
|
| 200 |
+
label = next((lbl for surf, lbl in _SUFFIX_MAP_SORTED if sl == surf), None)
|
| 201 |
+
if label:
|
| 202 |
+
result.append({
|
| 203 |
+
"token": raw,
|
| 204 |
+
"type": "SUFFIX",
|
| 205 |
+
"_reclassified": True,
|
| 206 |
+
"_suffix_label": label,
|
| 207 |
+
**{k: v for k, v in tok.items() if k not in ("token", "type")},
|
| 208 |
+
})
|
| 209 |
+
else:
|
| 210 |
+
result.append(tok)
|
| 211 |
+
|
| 212 |
+
return result
|
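For reference, a small sketch of the reclassification pass above (not part of the diff; token dicts use the base tokenizer's `{"token", "type"}` schema, with a leading space marking a word-initial piece):

```python
from turk_tokenizer._suffix_expander import reclassify_bpe_suffixes

tokens = [
    {"token": " kitap", "type": "ROOT"},  # word-initial root, left untouched
    {"token": "ların", "type": "BPE"},    # word-internal BPE piece
    {"token": ".", "type": "BPE"},        # trailing punctuation
]
for t in reclassify_bpe_suffixes(tokens):
    print(t["token"], t["type"], t.get("_suffix_label"))
# ' kitap'  ROOT    None
# 'ların'   SUFFIX  -PL+GEN   (matched in EXTENDED_SUFFIX_MAP)
# '.'       PUNCT   None      (matched by _is_punct)
```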
turk_tokenizer/_tdk_vocab.py
ADDED
|
@@ -0,0 +1,90 @@
|
| 1 |
+
"""Fix 7: TDK-based FOREIGN word detection."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
import json
|
| 6 |
+
import os
|
| 7 |
+
from pathlib import Path
|
| 8 |
+
|
| 9 |
+
_CACHE_DIR = Path.home() / ".cache" / "turk_tokenizer"
|
| 10 |
+
_CACHE_DIR.mkdir(parents=True, exist_ok=True)
|
| 11 |
+
TDK_CACHE_FILE = str(_CACHE_DIR / "tdk_words.txt")
|
| 12 |
+
|
| 13 |
+
TR_CHARS = set("çğışöüÇĞİŞÖÜ")
|
| 14 |
+
|
| 15 |
+
_TDK_WORDS: set | None = None
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
def load_tdk_words() -> set:
|
| 19 |
+
global _TDK_WORDS
|
| 20 |
+
if _TDK_WORDS is not None:
|
| 21 |
+
return _TDK_WORDS
|
| 22 |
+
|
| 23 |
+
if not os.path.exists(TDK_CACHE_FILE):
|
| 24 |
+
print("[TurkTokenizer] TDK word list not found — downloading automatically...")
|
| 25 |
+
words = download_tdk_words()
|
| 26 |
+
if not words:
|
| 27 |
+
_TDK_WORDS = set()
|
| 28 |
+
return _TDK_WORDS
|
| 29 |
+
|
| 30 |
+
with open(TDK_CACHE_FILE, encoding="utf-8") as f:
|
| 31 |
+
_TDK_WORDS = {line.strip().lower() for line in f if line.strip()}
|
| 32 |
+
return _TDK_WORDS
|
| 33 |
+
|
| 34 |
+
|
| 35 |
+
def download_tdk_words() -> list[str]:
|
| 36 |
+
"""Download ~76K Turkish words from the TDK API and cache them."""
|
| 37 |
+
try:
|
| 38 |
+
import urllib.request # noqa: PLC0415
|
| 39 |
+
|
| 40 |
+
url = "https://sozluk.gov.tr/autocomplete.json"
|
| 41 |
+
with urllib.request.urlopen(url, timeout=30) as resp:
|
| 42 |
+
data = json.loads(resp.read().decode("utf-8"))
|
| 43 |
+
|
| 44 |
+
words = sorted({item.get("madde", "").strip().lower() for item in data if item.get("madde")})
|
| 45 |
+
with open(TDK_CACHE_FILE, "w", encoding="utf-8") as f:
|
| 46 |
+
f.write("\n".join(words))
|
| 47 |
+
|
| 48 |
+
print(f"[TurkTokenizer] TDK: {len(words):,} words cached at {TDK_CACHE_FILE}")
|
| 49 |
+
return words
|
| 50 |
+
|
| 51 |
+
except Exception as exc: # noqa: BLE001
|
| 52 |
+
print(f"[TurkTokenizer] TDK download failed: {exc}")
|
| 53 |
+
print(" FOREIGN detection will be disabled for this session.")
|
| 54 |
+
return []
|
| 55 |
+
|
| 56 |
+
|
| 57 |
+
def is_foreign_word(word: str) -> bool:
|
| 58 |
+
w = word.strip().lower()
|
| 59 |
+
if not w or len(w) < 2:
|
| 60 |
+
return False
|
| 61 |
+
if any(c in TR_CHARS for c in w):
|
| 62 |
+
return False
|
| 63 |
+
return w not in load_tdk_words()
|
| 64 |
+
|
| 65 |
+
|
| 66 |
+
def reclassify_foreign_words(tokens: list[dict]) -> list[dict]:
|
| 67 |
+
"""Reclassify word-initial BPE tokens as ROOT if they are foreign words."""
|
| 68 |
+
tdk = load_tdk_words()
|
| 69 |
+
if not tdk:
|
| 70 |
+
return tokens
|
| 71 |
+
|
| 72 |
+
result: list[dict] = []
|
| 73 |
+
for tok in tokens:
|
| 74 |
+
if tok["type"] != "BPE":
|
| 75 |
+
result.append(tok)
|
| 76 |
+
continue
|
| 77 |
+
|
| 78 |
+
raw = tok["token"]
|
| 79 |
+
stripped = raw.lstrip()
|
| 80 |
+
|
| 81 |
+
if raw == stripped: # no leading space → not word-initial
|
| 82 |
+
result.append(tok)
|
| 83 |
+
continue
|
| 84 |
+
|
| 85 |
+
if is_foreign_word(stripped):
|
| 86 |
+
result.append({**tok, "type": "ROOT", "_foreign": True, "_tdk": False})
|
| 87 |
+
else:
|
| 88 |
+
result.append(tok)
|
| 89 |
+
|
| 90 |
+
return result
|
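A short sketch of how the TDK check behaves (not part of the diff). The first call to `load_tdk_words` downloads and caches the word list under `~/.cache/turk_tokenizer/`; if the download fails, `reclassify_foreign_words` becomes a no-op because the cached set is empty. Expected outputs are hedged, since they depend on the downloaded list:

```python
from turk_tokenizer._tdk_vocab import is_foreign_word, reclassify_foreign_words

print(is_foreign_word("meeting"))  # likely True: ASCII-only and not a TDK headword
print(is_foreign_word("çocuk"))    # False: contains a Turkish-specific character

tokens = [{"token": " meeting", "type": "BPE"}, {"token": "e", "type": "BPE"}]
print(reclassify_foreign_words(tokens)[0]["type"])  # likely "ROOT" with _foreign=True
```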
turk_tokenizer/data/zemberek-full.jar
ADDED
|
@@ -0,0 +1,3 @@
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:74ee8736b73dc2ca878071b80829f9c5acccc268d4b8b7795d36d60db26a1731
|
| 3 |
+
size 31644792
|
turk_tokenizer/tokenizer.py
ADDED
|
@@ -0,0 +1,308 @@
|
| 1 |
+
"""
|
| 2 |
+
TurkTokenizer — production-ready Turkish morphological tokenizer.
|
| 3 |
+
|
| 4 |
+
Applies 12 sequential fixes on top of the base turkish-tokenizer:
|
| 5 |
+
1. ALL CAPS inflation fix
|
| 6 |
+
2. Apostrophe / code-switching split
|
| 7 |
+
3. BPE→SUFFIX reclassification
|
| 8 |
+
4. Zemberek root validation & correction
|
| 9 |
+
5. Punctuation → PUNCT type
|
| 10 |
+
6. Domain vocabulary (medical / sports / tourism)
|
| 11 |
+
7. TDK-based FOREIGN word detection
|
| 12 |
+
8. Special token normalization (NUM, DATE, URL, MENTION, HASHTAG, EMOJI)
|
| 13 |
+
9. Allomorph canonicalization
|
| 14 |
+
10. Compound word decomposition
|
| 15 |
+
11. Acronym expansion
|
| 16 |
+
12. Context-aware Zemberek disambiguation
|
| 17 |
+
|
| 18 |
+
Output fields per token:
|
| 19 |
+
token : str — token string (leading space = word-initial)
|
| 20 |
+
token_type : str — ROOT | SUFFIX | FOREIGN | BPE | PUNCT |
|
| 21 |
+
NUM | DATE | UNIT | URL | MENTION | HASHTAG | EMOJI
|
| 22 |
+
morph_pos : int — 0=root/word-initial, 1=first suffix, 2=second suffix…
|
| 23 |
+
(+ optional _* metadata fields)
|
| 24 |
+
"""
|
| 25 |
+
|
| 26 |
+
from __future__ import annotations
|
| 27 |
+
|
| 28 |
+
import os
|
| 29 |
+
import multiprocessing
|
| 30 |
+
from concurrent.futures import ProcessPoolExecutor, as_completed
|
| 31 |
+
from pathlib import Path
|
| 32 |
+
|
| 33 |
+
from ._java_check import ensure_java
|
| 34 |
+
from ._preprocessor import preprocess, postprocess
|
| 35 |
+
from ._suffix_expander import reclassify_bpe_suffixes
|
| 36 |
+
from ._root_validator import validate_roots, ZEMBEREK_AVAILABLE
|
| 37 |
+
from ._medical_vocab import ALL_DOMAIN_ROOTS
|
| 38 |
+
from ._tdk_vocab import reclassify_foreign_words
|
| 39 |
+
from ._normalizer import (
|
| 40 |
+
preprocess_special_tokens,
|
| 41 |
+
restore_special_tokens,
|
| 42 |
+
reclassify_numbers_in_tokens,
|
| 43 |
+
)
|
| 44 |
+
from ._allomorph import add_canonical_labels
|
| 45 |
+
from ._compound import add_compound_info
|
| 46 |
+
from ._acronym_dict import reclassify_acronyms
|
| 47 |
+
from ._context_aware import annotate_with_context
|
| 48 |
+
|
| 49 |
+
try:
|
| 50 |
+
from ._root_validator import _morphology as _zemb_morphology
|
| 51 |
+
except Exception:
|
| 52 |
+
_zemb_morphology = None
|
| 53 |
+
|
| 54 |
+
_DOMAIN_ROOTS_LOWER = {k.lower() for k in ALL_DOMAIN_ROOTS}
|
| 55 |
+
|
| 56 |
+
# ── Token types ───────────────────────────────────────────────────────────────
|
| 57 |
+
|
| 58 |
+
_SPECIAL_TYPES = frozenset(
|
| 59 |
+
("NUM", "DATE", "UNIT", "URL", "MENTION", "HASHTAG", "EMOJI")
|
| 60 |
+
)
|
| 61 |
+
|
| 62 |
+
_TYPE_SYM = {
|
| 63 |
+
"ROOT": "R", "SUFFIX": "S", "FOREIGN": "F", "BPE": "B", "PUNCT": "P",
|
| 64 |
+
"NUM": "N", "DATE": "D", "UNIT": "U",
|
| 65 |
+
"URL": "L", "MENTION": "@", "HASHTAG": "#", "EMOJI": "E",
|
| 66 |
+
}
|
| 67 |
+
|
| 68 |
+
|
| 69 |
+
# ── Parallel worker helpers ───────────────────────────────────────────────────
|
| 70 |
+
|
| 71 |
+
_worker_tok: "TurkTokenizer | None" = None
|
| 72 |
+
|
| 73 |
+
|
| 74 |
+
def _init_worker() -> None:
|
| 75 |
+
global _worker_tok
|
| 76 |
+
_worker_tok = TurkTokenizer()
|
| 77 |
+
|
| 78 |
+
|
| 79 |
+
def _tokenize_one(text: str) -> list[dict]:
|
| 80 |
+
assert _worker_tok is not None
|
| 81 |
+
return _worker_tok.tokenize(text)
|
| 82 |
+
|
| 83 |
+
|
| 84 |
+
# ══════════════════════════════════════════════════════════════════════════════
|
| 85 |
+
|
| 86 |
+
class TurkTokenizer:
|
| 87 |
+
"""
|
| 88 |
+
Turkish morphological tokenizer with HuggingFace-compatible interface.
|
| 89 |
+
|
| 90 |
+
Example::
|
| 91 |
+
|
| 92 |
+
from turk_tokenizer import TurkTokenizer
|
| 93 |
+
|
| 94 |
+
tok = TurkTokenizer()
|
| 95 |
+
tokens = tok("İstanbul'da meeting'e katılamadım")
|
| 96 |
+
for t in tokens:
|
| 97 |
+
print(t["token"], t["token_type"], t["morph_pos"])
|
| 98 |
+
"""
|
| 99 |
+
|
| 100 |
+
def __init__(self) -> None:
|
| 101 |
+
ensure_java()
|
| 102 |
+
from turkish_tokenizer import TurkishTokenizer # noqa: PLC0415
|
| 103 |
+
self._base = TurkishTokenizer()
|
| 104 |
+
self.zemberek_available = ZEMBEREK_AVAILABLE
|
| 105 |
+
|
| 106 |
+
# ── Public API ────────────────────────────────────────────────────────────
|
| 107 |
+
|
| 108 |
+
def __call__(self, text: str) -> list[dict]:
|
| 109 |
+
return self.tokenize(text)
|
| 110 |
+
|
| 111 |
+
def tokenize(self, text: str) -> list[dict]:
|
| 112 |
+
"""Tokenize a single text string.
|
| 113 |
+
|
| 114 |
+
Returns a list of token dicts, each with:
|
| 115 |
+
``token``, ``token_type``, ``morph_pos``, and optional ``_*`` fields.
|
| 116 |
+
"""
|
| 117 |
+
# Fix 8 pre: replace URLs, mentions, numbers etc. with placeholders
|
| 118 |
+
text_norm, specials = preprocess_special_tokens(text)
|
| 119 |
+
|
| 120 |
+
# Fix 1 & 2 pre: ALL CAPS + apostrophe
|
| 121 |
+
processed, caps_map = preprocess(text_norm)
|
| 122 |
+
|
| 123 |
+
# Base tokenizer
|
| 124 |
+
raw = self._base.tokenize_text(processed)
|
| 125 |
+
|
| 126 |
+
# Fix 8 post: restore placeholders
|
| 127 |
+
tokens = restore_special_tokens(raw, specials)
|
| 128 |
+
|
| 129 |
+
# Fix 1 & 2 post
|
| 130 |
+
tokens = postprocess(tokens, caps_map)
|
| 131 |
+
|
| 132 |
+
# Fix 3 + 5: BPE→SUFFIX reclassification + PUNCT
|
| 133 |
+
tokens = reclassify_bpe_suffixes(tokens)
|
| 134 |
+
|
| 135 |
+
# Fix 8b: remaining numbers / units
|
| 136 |
+
tokens = reclassify_numbers_in_tokens(tokens)
|
| 137 |
+
|
| 138 |
+
# Fix 6: domain vocabulary (medical / sports / tourism)
|
| 139 |
+
tokens = _reclassify_domain_roots(tokens, _DOMAIN_ROOTS_LOWER)
|
| 140 |
+
|
| 141 |
+
# Fix 7: TDK FOREIGN detection
|
| 142 |
+
tokens = reclassify_foreign_words(tokens)
|
| 143 |
+
|
| 144 |
+
# Fix 11: acronym expansions
|
| 145 |
+
tokens = reclassify_acronyms(tokens)
|
| 146 |
+
|
| 147 |
+
# Fix 9: allomorph canonical labels
|
| 148 |
+
tokens = add_canonical_labels(tokens)
|
| 149 |
+
|
| 150 |
+
# Fix 10: compound word annotation
|
| 151 |
+
tokens = add_compound_info(tokens, morphology=_zemb_morphology)
|
| 152 |
+
|
| 153 |
+
# Fix 12: context-aware Zemberek disambiguation
|
| 154 |
+
tokens = annotate_with_context(tokens, text)
|
| 155 |
+
|
| 156 |
+
# Fix 4: Zemberek root validation & correction
|
| 157 |
+
tokens = validate_roots(tokens, text.split(), base_tokenizer=self._base)
|
| 158 |
+
|
| 159 |
+
# Add public output fields
|
| 160 |
+
tokens = _add_output_fields(tokens)
|
| 161 |
+
|
| 162 |
+
return tokens
|
| 163 |
+
|
| 164 |
+
def batch_tokenize(
|
| 165 |
+
self,
|
| 166 |
+
texts: list[str],
|
| 167 |
+
workers: int | None = None,
|
| 168 |
+
chunk_size: int = 64,
|
| 169 |
+
) -> list[list[dict]]:
|
| 170 |
+
"""Tokenize a list of texts in parallel.
|
| 171 |
+
|
| 172 |
+
Args:
|
| 173 |
+
texts: List of strings to tokenize.
|
| 174 |
+
workers: Number of worker processes (None = all CPUs).
|
| 175 |
+
chunk_size: Below this count, run sequentially to avoid overhead.
|
| 176 |
+
|
| 177 |
+
Returns:
|
| 178 |
+
List of token lists, in the same order as ``texts``.
|
| 179 |
+
"""
|
| 180 |
+
if not texts:
|
| 181 |
+
return []
|
| 182 |
+
|
| 183 |
+
n = workers or os.cpu_count() or 4
|
| 184 |
+
|
| 185 |
+
if len(texts) <= chunk_size or n == 1:
|
| 186 |
+
return [self.tokenize(t) for t in texts]
|
| 187 |
+
|
| 188 |
+
results: list[list[dict] | None] = [None] * len(texts)
|
| 189 |
+
|
| 190 |
+
with ProcessPoolExecutor(max_workers=n, initializer=_init_worker) as pool:
|
| 191 |
+
futs = {pool.submit(_tokenize_one, t): i for i, t in enumerate(texts)}
|
| 192 |
+
for fut in as_completed(futs):
|
| 193 |
+
i = futs[fut]
|
| 194 |
+
try:
|
| 195 |
+
results[i] = fut.result()
|
| 196 |
+
except Exception as exc: # noqa: BLE001
|
| 197 |
+
results[i] = self._base.tokenize_text(texts[i])
|
| 198 |
+
print(f"[TurkTokenizer] fallback at idx={i}: {exc}")
|
| 199 |
+
|
| 200 |
+
return results # type: ignore[return-value]
|
| 201 |
+
|
| 202 |
+
# ── HuggingFace-style helpers ─────────────────────────────────────────────
|
| 203 |
+
|
| 204 |
+
@classmethod
|
| 205 |
+
def from_pretrained(cls, _model_id: str = "Ethosoft/turk-tokenizer") -> "TurkTokenizer":
|
| 206 |
+
"""Load tokenizer (rules-based, no weights to download)."""
|
| 207 |
+
return cls()
|
| 208 |
+
|
| 209 |
+
def save_pretrained(self, save_directory: str) -> None:
|
| 210 |
+
"""Save tokenizer config to a directory (for HF Hub compatibility)."""
|
| 211 |
+
import json
|
| 212 |
+
path = Path(save_directory)
|
| 213 |
+
path.mkdir(parents=True, exist_ok=True)
|
| 214 |
+
config = {
|
| 215 |
+
"tokenizer_class": "TurkTokenizer",
|
| 216 |
+
"model_type": "turk-tokenizer",
|
| 217 |
+
"version": "1.0.0",
|
| 218 |
+
"zemberek_available": self.zemberek_available,
|
| 219 |
+
}
|
| 220 |
+
(path / "tokenizer_config.json").write_text(
|
| 221 |
+
json.dumps(config, ensure_ascii=False, indent=2), encoding="utf-8"
|
| 222 |
+
)
|
| 223 |
+
|
| 224 |
+
# ── Utility ───────────────────────────────────────────────────────────────
|
| 225 |
+
|
| 226 |
+
def stats(self, tokens: list[dict]) -> dict:
|
| 227 |
+
"""Compute morphological coverage statistics for a token list."""
|
| 228 |
+
total = len(tokens)
|
| 229 |
+
if total == 0:
|
| 230 |
+
return {k: 0 for k in ("total", "roots", "suffixes", "foreign",
|
| 231 |
+
"bpe", "punct", "special", "tr_pct", "pure_pct")}
|
| 232 |
+
roots = sum(1 for t in tokens if t["token_type"] == "ROOT")
|
| 233 |
+
suffixes = sum(1 for t in tokens if t["token_type"] == "SUFFIX")
|
| 234 |
+
foreign = sum(1 for t in tokens if t["token_type"] == "FOREIGN")
|
| 235 |
+
punct = sum(1 for t in tokens if t["token_type"] == "PUNCT")
|
| 236 |
+
bpe = sum(1 for t in tokens if t["token_type"] == "BPE")
|
| 237 |
+
special = sum(1 for t in tokens if t["token_type"] in _SPECIAL_TYPES)
|
| 238 |
+
tr = roots + suffixes + foreign + punct + special
|
| 239 |
+
pure = sum(
|
| 240 |
+
1 for t in tokens
|
| 241 |
+
if t["token_type"] in ("ROOT", "SUFFIX", "FOREIGN")
|
| 242 |
+
and not t["token"].strip().startswith("<")
|
| 243 |
+
)
|
| 244 |
+
return {
|
| 245 |
+
"total": total,
|
| 246 |
+
"roots": roots,
|
| 247 |
+
"suffixes": suffixes,
|
| 248 |
+
"foreign": foreign,
|
| 249 |
+
"bpe": bpe,
|
| 250 |
+
"punct": punct,
|
| 251 |
+
"special": special,
|
| 252 |
+
"tr_pct": round(tr / total * 100, 2),
|
| 253 |
+
"pure_pct": round(pure / total * 100, 2),
|
| 254 |
+
}
|
| 255 |
+
|
| 256 |
+
|
| 257 |
+
# ── Internal helpers ──────────────────────────────────────────────────────────
|
| 258 |
+
|
| 259 |
+
def _reclassify_domain_roots(tokens: list[dict], domain_lower: set) -> list[dict]:
|
| 260 |
+
result = []
|
| 261 |
+
for tok in tokens:
|
| 262 |
+
if tok["type"] != "BPE":
|
| 263 |
+
result.append(tok)
|
| 264 |
+
continue
|
| 265 |
+
raw = tok["token"]
|
| 266 |
+
if raw == raw.lstrip(): # no leading space → not word-initial
|
| 267 |
+
result.append(tok)
|
| 268 |
+
continue
|
| 269 |
+
if raw.lstrip().lower() in domain_lower:
|
| 270 |
+
result.append({**tok, "type": "ROOT", "_domain": True})
|
| 271 |
+
else:
|
| 272 |
+
result.append(tok)
|
| 273 |
+
return result
|
| 274 |
+
|
| 275 |
+
|
| 276 |
+
def _add_output_fields(tokens: list[dict]) -> list[dict]:
|
| 277 |
+
"""Compute token_type and morph_pos and add them to every token."""
|
| 278 |
+
result = []
|
| 279 |
+
word_pos = 0
|
| 280 |
+
|
| 281 |
+
for tok in tokens:
|
| 282 |
+
raw = tok["token"]
|
| 283 |
+
base_type = tok["type"]
|
| 284 |
+
stripped = raw.strip()
|
| 285 |
+
|
| 286 |
+
# ── token_type: FOREIGN for foreign ROOTs ─────────────────────────
|
| 287 |
+
if base_type == "ROOT" and tok.get("_foreign"):
|
| 288 |
+
token_type = "FOREIGN"
|
| 289 |
+
else:
|
| 290 |
+
token_type = base_type
|
| 291 |
+
|
| 292 |
+
# ── morph_pos ─────────────────────────────────────────────────────
|
| 293 |
+
is_word_start = raw.startswith(" ") or stripped.startswith("<")
|
| 294 |
+
|
| 295 |
+
if is_word_start or base_type in _SPECIAL_TYPES or base_type == "PUNCT":
|
| 296 |
+
word_pos = 0
|
| 297 |
+
morph_pos = 0
|
| 298 |
+
elif base_type == "SUFFIX":
|
| 299 |
+
word_pos += 1
|
| 300 |
+
morph_pos = word_pos
|
| 301 |
+
else:
|
| 302 |
+
# ROOT or BPE within a word (no leading space)
|
| 303 |
+
word_pos = 0
|
| 304 |
+
morph_pos = 0
|
| 305 |
+
|
| 306 |
+
result.append({**tok, "token_type": token_type, "morph_pos": morph_pos})
|
| 307 |
+
|
| 308 |
+
return result
|