nmstech Claude Opus 4.6 committed on Mar 18

Commit

cfffd93

1 Parent(s): e430fca

Rename project from TurkTokenizer to NedoTurkishTokenizer

- Rename module directory: turk_tokenizer/ -> nedo_turkish_tokenizer/
- Rename HF wrapper: tokenization_turk.py -> tokenization_nedo_turkish.py
- Update class name: TurkTokenizer -> NedoTurkishTokenizer
- Update PyPI package name: turk-tokenizer -> nedo-turkish-tokenizer
- Update all HuggingFace URLs to Ethosoft/NedoTurkishTokenizer
- Update log messages, cache paths, and config references

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

Files changed (21) hide show

.gitattributes +1 -1
README.md +26 -26
{turk_tokenizer → nedo_turkish_tokenizer}/__init__.py +5 -5
{turk_tokenizer → nedo_turkish_tokenizer}/_acronym_dict.py +0 -0
{turk_tokenizer → nedo_turkish_tokenizer}/_allomorph.py +0 -0
{turk_tokenizer → nedo_turkish_tokenizer}/_compound.py +0 -0
{turk_tokenizer → nedo_turkish_tokenizer}/_context_aware.py +0 -0
{turk_tokenizer → nedo_turkish_tokenizer}/_java_check.py +1 -1
{turk_tokenizer → nedo_turkish_tokenizer}/_medical_vocab.py +0 -0
{turk_tokenizer → nedo_turkish_tokenizer}/_normalizer.py +0 -0
{turk_tokenizer → nedo_turkish_tokenizer}/_preprocessor.py +0 -0
{turk_tokenizer → nedo_turkish_tokenizer}/_root_validator.py +3 -3
{turk_tokenizer → nedo_turkish_tokenizer}/_suffix_expander.py +0 -0
{turk_tokenizer → nedo_turkish_tokenizer}/_tdk_vocab.py +9 -9
{turk_tokenizer → nedo_turkish_tokenizer}/data/tdk_words.txt +0 -0
{turk_tokenizer → nedo_turkish_tokenizer}/data/turkish_proper_nouns.txt +0 -0
{turk_tokenizer → nedo_turkish_tokenizer}/data/zemberek-full.jar +0 -0
{turk_tokenizer → nedo_turkish_tokenizer}/tokenizer.py +10 -10
pyproject.toml +5 -5
tokenization_turk.py → tokenization_nedo_turkish.py +6 -6
tokenizer_config.json +3 -3

.gitattributes CHANGED Viewed

@@ -33,4 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
-turk_tokenizer/data/zemberek-full.jar filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+nedo_turkish_tokenizer/data/zemberek-full.jar filter=lfs diff=lfs merge=lfs -text

README.md CHANGED Viewed

@@ -8,15 +8,15 @@ tags:
 - nlp
 - transformers
 license: mit
-library_name: turk-tokenizer
 pipeline_tag: token-classification
 ---
-# TurkTokenizer
 **Turkish morphological tokenizer — TR-MMLU world record 95.45%**
-TurkTokenizer performs linguistically-aware tokenization of Turkish text using morphological rules. Unlike BPE-based tokenizers, it produces meaningful morphological units (roots and suffixes) aligned with Turkish grammar, powered by [Zemberek NLP](https://github.com/ahmetaa/zemberek-nlp).
 ## Model Details
@@ -35,7 +35,7 @@ TurkTokenizer performs linguistically-aware tokenization of Turkish text using m
 ### Installation
 ```bash
-pip install git+https://huggingface.co/Ethosoft/turk-tokenizer
 ```
 > **Java is required** for Zemberek morphological analysis.
@@ -50,12 +50,12 @@ pip install git+https://huggingface.co/Ethosoft/turk-tokenizer
 ---
-### With 🤗 Transformers (`AutoTokenizer`)
 ```python
 from transformers import AutoTokenizer
-tok = AutoTokenizer.from_pretrained("Ethosoft/turk-tokenizer", trust_remote_code=True)
 out = tok("Türk dili, morfolojik açıdan zengin bir dildir.")
 print(out["input_ids"])            # hash-stable int IDs
@@ -69,7 +69,7 @@ for t in out["morphological_tokens"]:
 **Batch tokenization:**
 ```python
 out = tok(["Türkçe metin.", "Another sentence with code-switching."])
-# out["input_ids"]  → list of lists
 ```
 **Direct morphological tokenization:**
@@ -79,7 +79,7 @@ for t in tokens:
     print(f"{t['token']:20s} {t['token_type']:8s} pos={t['morph_pos']}", end="")
     if t.get("_canonical"):   print(f"  [{t['_canonical']}]", end="")
     if t.get("_compound"):    print(f"  compound={t['_parts']}", end="")
-    if t.get("_expansion"):   print(f"  → {t['_expansion']}", end="")
     print()
 ```
@@ -88,9 +88,9 @@ for t in tokens:
 ### Standalone (without Transformers)
 ```python
-from turk_tokenizer import TurkTokenizer
-tok = TurkTokenizer()
 # Single text
 tokens = tok("İSTANBUL'da meeting'e katılamadım")
@@ -132,7 +132,7 @@ Every token dict contains:
 |---|---|---|
 | `token` | `str` | Token string — leading space means word-initial |
 | `token_type` | `str` | Morphological type (see table below) |
-| `morph_pos` | `int` | Position within word: `0`=root, `1`=1st suffix, `2`=2nd suffix… |
 ### Token Types
@@ -149,42 +149,42 @@ Every token dict contains:
 | `URL` | Web address | `https://...` |
 | `MENTION` | @username | `@ethosoft` |
 | `HASHTAG` | #topic | `#NLP` |
-| `EMOJI` | Emoji | `😊` |
 ### Optional Metadata Fields
 | Field | Description |
 |---|---|
-| `_canonical` | Canonical morpheme: `"lar"/"ler"` → `"PL"`, `"dan"/"den"` → `"ABL"` |
-| `_suffix_label` | Detailed morphological label: `-PL+ACC`, `-P3+LOC`, … |
 | `_foreign` | `True` — foreign root detected by TDK lookup |
 | `_caps` | `True` — originally ALL CAPS word |
 | `_domain` | `True` — medical / sports / tourism domain word |
 | `_compound` | `True` — compound word (e.g. `başbakan`) |
 | `_parts` | Compound parts: `["baş", "bakan"]` |
-| `_expansion` | Acronym expansion: `"CMV"` → `"Sitomegalovirüs"` |
-| `_pos` | POS tag from Zemberek: `Noun`, `Verb`, `Adj`, `Num`… |
-| `_lemma` | Lemma from Zemberek: `"gelir"` → `"gelmek"` (when verb) |
-| `_disambiguated` | `True` — context disambiguation applied (`"yüz"`, `"gelir"`…) |
-| `_root_corrected` | `True` — phonetic root correction: `"gök"` → `"göğüs"` |
 ---
 ## How It Works
-TurkTokenizer wraps the base `turkish-tokenizer` BPE model with **12 sequential morphological fixes**:
 | Fix | Problem | Solution |
 |---|---|---|
-| 1 | `İSTANBUL` → 16 BPE tokens | Lowercase before tokenization, restore `<uppercase_word>` marker |
-| 2 | `meeting'e` → broken BPE | Detect foreign base + Turkish suffix, split at apostrophe |
-| 3 | Turkish suffixes classified as BPE | 260+ suffix patterns reclassified → SUFFIX |
 | 4 | Wrong roots (`gök` for `göğüs`) | Zemberek phonetic root validation & correction |
 | 5 | Punctuation counted as BPE | Classify as PUNCT |
 | 6 | Medical/domain terms as BPE | 500+ medical, sports, tourism root vocabulary |
-| 7 | Foreign words as BPE | TDK 76K+ word lookup → FOREIGN ROOT |
 | 8 | Numbers, URLs, mentions fragmented | Pre-tokenization placeholder normalization |
-| 9 | `lar`/`ler` different IDs for same morpheme | Allomorph canonicalization (`PL`, `ACC`, `DAT`…) |
 | 10 | `başbakan` as single unknown ROOT | Compound word decomposition |
 | 11 | `CMV`, `NATO` without meaning | Acronym expansion dictionary (100+ entries) |
 | 12 | `yüz` = 100 or face or swim? | Zemberek sentence-level context disambiguation |
@@ -193,4 +193,4 @@ TurkTokenizer wraps the base `turkish-tokenizer` BPE model with **12 sequential
 ## License
-MIT © [Ethosoft](https://huggingface.co/Ethosoft)

 - nlp
 - transformers
 license: mit
+library_name: nedo-turkish-tokenizer
 pipeline_tag: token-classification
 ---
+# NedoTurkishTokenizer
 **Turkish morphological tokenizer — TR-MMLU world record 95.45%**
+NedoTurkishTokenizer performs linguistically-aware tokenization of Turkish text using morphological rules. Unlike BPE-based tokenizers, it produces meaningful morphological units (roots and suffixes) aligned with Turkish grammar, powered by [Zemberek NLP](https://github.com/ahmetaa/zemberek-nlp).
 ## Model Details
 ### Installation
 ```bash
+pip install git+https://huggingface.co/Ethosoft/NedoTurkishTokenizer
 ```
 > **Java is required** for Zemberek morphological analysis.
 ---
+### With Transformers (`AutoTokenizer`)
 ```python
 from transformers import AutoTokenizer
+tok = AutoTokenizer.from_pretrained("Ethosoft/NedoTurkishTokenizer", trust_remote_code=True)
 out = tok("Türk dili, morfolojik açıdan zengin bir dildir.")
 print(out["input_ids"])            # hash-stable int IDs
 **Batch tokenization:**
 ```python
 out = tok(["Türkçe metin.", "Another sentence with code-switching."])
+# out["input_ids"]  -> list of lists
 ```
 **Direct morphological tokenization:**
     print(f"{t['token']:20s} {t['token_type']:8s} pos={t['morph_pos']}", end="")
     if t.get("_canonical"):   print(f"  [{t['_canonical']}]", end="")
     if t.get("_compound"):    print(f"  compound={t['_parts']}", end="")
+    if t.get("_expansion"):   print(f"  -> {t['_expansion']}", end="")
     print()
 ```
 ### Standalone (without Transformers)
 ```python
+from nedo_turkish_tokenizer import NedoTurkishTokenizer
+tok = NedoTurkishTokenizer()
 # Single text
 tokens = tok("İSTANBUL'da meeting'e katılamadım")
 |---|---|---|
 | `token` | `str` | Token string — leading space means word-initial |
 | `token_type` | `str` | Morphological type (see table below) |
+| `morph_pos` | `int` | Position within word: `0`=root, `1`=1st suffix, `2`=2nd suffix... |
 ### Token Types
 | `URL` | Web address | `https://...` |
 | `MENTION` | @username | `@ethosoft` |
 | `HASHTAG` | #topic | `#NLP` |
+| `EMOJI` | Emoji | `😊` |
 ### Optional Metadata Fields
 | Field | Description |
 |---|---|
+| `_canonical` | Canonical morpheme: `"lar"/"ler"` -> `"PL"`, `"dan"/"den"` -> `"ABL"` |
+| `_suffix_label` | Detailed morphological label: `-PL+ACC`, `-P3+LOC`, ... |
 | `_foreign` | `True` — foreign root detected by TDK lookup |
 | `_caps` | `True` — originally ALL CAPS word |
 | `_domain` | `True` — medical / sports / tourism domain word |
 | `_compound` | `True` — compound word (e.g. `başbakan`) |
 | `_parts` | Compound parts: `["baş", "bakan"]` |
+| `_expansion` | Acronym expansion: `"CMV"` -> `"Sitomegalovirüs"` |
+| `_pos` | POS tag from Zemberek: `Noun`, `Verb`, `Adj`, `Num`... |
+| `_lemma` | Lemma from Zemberek: `"gelir"` -> `"gelmek"` (when verb) |
+| `_disambiguated` | `True` — context disambiguation applied (`"yüz"`, `"gelir"`...) |
+| `_root_corrected` | `True` — phonetic root correction: `"gök"` -> `"göğüs"` |
 ---
 ## How It Works
+NedoTurkishTokenizer wraps the base `turkish-tokenizer` BPE model with **12 sequential morphological fixes**:
 | Fix | Problem | Solution |
 |---|---|---|
+| 1 | `İSTANBUL` -> 16 BPE tokens | Lowercase before tokenization, restore `<uppercase_word>` marker |
+| 2 | `meeting'e` -> broken BPE | Detect foreign base + Turkish suffix, split at apostrophe |
+| 3 | Turkish suffixes classified as BPE | 260+ suffix patterns reclassified -> SUFFIX |
 | 4 | Wrong roots (`gök` for `göğüs`) | Zemberek phonetic root validation & correction |
 | 5 | Punctuation counted as BPE | Classify as PUNCT |
 | 6 | Medical/domain terms as BPE | 500+ medical, sports, tourism root vocabulary |
+| 7 | Foreign words as BPE | TDK 76K+ word lookup -> FOREIGN ROOT |
 | 8 | Numbers, URLs, mentions fragmented | Pre-tokenization placeholder normalization |
+| 9 | `lar`/`ler` different IDs for same morpheme | Allomorph canonicalization (`PL`, `ACC`, `DAT`...) |
 | 10 | `başbakan` as single unknown ROOT | Compound word decomposition |
 | 11 | `CMV`, `NATO` without meaning | Acronym expansion dictionary (100+ entries) |
 | 12 | `yüz` = 100 or face or swim? | Zemberek sentence-level context disambiguation |
 ## License
+MIT (c) [Ethosoft](https://huggingface.co/Ethosoft)

{turk_tokenizer → nedo_turkish_tokenizer}/__init__.py RENAMED Viewed

@@ -1,11 +1,11 @@
 """
-TurkTokenizer — Turkish morphological tokenizer.
 TR-MMLU world record: 92%
 Usage:
-    from turk_tokenizer import TurkTokenizer
-    tok = TurkTokenizer()
     tokens = tok("İstanbul'da meeting'e katılamadım")
     # Each token dict contains:
@@ -15,7 +15,7 @@ Usage:
     #   morph_pos  : int   — 0=root/word-initial, 1=first suffix, 2=second...
 """
-from .tokenizer import TurkTokenizer
-__all__ = ["TurkTokenizer"]
 __version__ = "1.0.0"

 """
+NedoTurkishTokenizer — Turkish morphological tokenizer.
 TR-MMLU world record: 92%
 Usage:
+    from nedo_turkish_tokenizer import NedoTurkishTokenizer
+    tok = NedoTurkishTokenizer()
     tokens = tok("İstanbul'da meeting'e katılamadım")
     # Each token dict contains:
     #   morph_pos  : int   — 0=root/word-initial, 1=first suffix, 2=second...
 """
+from .tokenizer import NedoTurkishTokenizer
+__all__ = ["NedoTurkishTokenizer"]
 __version__ = "1.0.0"

{turk_tokenizer → nedo_turkish_tokenizer}/_acronym_dict.py RENAMED Viewed

File without changes

{turk_tokenizer → nedo_turkish_tokenizer}/_allomorph.py RENAMED Viewed

File without changes

{turk_tokenizer → nedo_turkish_tokenizer}/_compound.py RENAMED Viewed

File without changes

{turk_tokenizer → nedo_turkish_tokenizer}/_context_aware.py RENAMED Viewed

File without changes

{turk_tokenizer → nedo_turkish_tokenizer}/_java_check.py RENAMED Viewed

@@ -24,7 +24,7 @@ def ensure_java() -> None:
     raise RuntimeError(
         "\n"
         "╔══════════════════════════════════════════════════════════════╗\n"
-        "║  TurkTokenizer requires Java (JVM) — not found on this system  ║\n"
         "╠══════════════════════════════════════════════════════════════╣\n"
         f"║  Install Java with:                                          ║\n"
         f"║    {_install_cmd:<58}║\n"

     raise RuntimeError(
         "\n"
         "╔══════════════════════════════════════════════════════════════╗\n"
+        "║  NedoTurkishTokenizer requires Java (JVM) — not found on this system  ║\n"
         "╠══════════════════════════════════════════════════════════════╣\n"
         f"║  Install Java with:                                          ║\n"
         f"║    {_install_cmd:<58}║\n"

{turk_tokenizer → nedo_turkish_tokenizer}/_medical_vocab.py RENAMED Viewed

File without changes

{turk_tokenizer → nedo_turkish_tokenizer}/_normalizer.py RENAMED Viewed

File without changes

{turk_tokenizer → nedo_turkish_tokenizer}/_preprocessor.py RENAMED Viewed

File without changes

{turk_tokenizer → nedo_turkish_tokenizer}/_root_validator.py RENAMED Viewed

@@ -19,7 +19,7 @@ def _init_zemberek() -> None:
     if not JAR_PATH.exists():
         print(
-            f"[TurkTokenizer] zemberek-full.jar not found at {JAR_PATH}\n"
             "  Root validation disabled — morphological fixes will be limited."
         )
         return
@@ -40,9 +40,9 @@ def _init_zemberek() -> None:
         ZEMBEREK_AVAILABLE = True
     except ImportError:
-        print("[TurkTokenizer] jpype1 not installed → pip install jpype1")
     except Exception as exc:  # noqa: BLE001
-        print(f"[TurkTokenizer] Zemberek init failed: {exc}")
 _init_zemberek()

     if not JAR_PATH.exists():
         print(
+            f"[NedoTurkishTokenizer] zemberek-full.jar not found at {JAR_PATH}\n"
             "  Root validation disabled — morphological fixes will be limited."
         )
         return
         ZEMBEREK_AVAILABLE = True
     except ImportError:
+        print("[NedoTurkishTokenizer] jpype1 not installed → pip install jpype1")
     except Exception as exc:  # noqa: BLE001
+        print(f"[NedoTurkishTokenizer] Zemberek init failed: {exc}")
 _init_zemberek()

{turk_tokenizer → nedo_turkish_tokenizer}/_suffix_expander.py RENAMED Viewed

File without changes

{turk_tokenizer → nedo_turkish_tokenizer}/_tdk_vocab.py RENAMED Viewed

@@ -6,7 +6,7 @@ import json
 import os
 from pathlib import Path
-_CACHE_DIR = Path.home() / ".cache" / "turk_tokenizer"
 _CACHE_DIR.mkdir(parents=True, exist_ok=True)
 TDK_CACHE_FILE = str(_CACHE_DIR / "tdk_words.txt")
@@ -16,8 +16,8 @@ _TDK_WORDS: set | None = None
 _HF_TDK_URL = (
-    "https://huggingface.co/Ethosoft/turk-tokenizer/resolve/main"
-    "/turk_tokenizer/data/tdk_words.txt"
 )
@@ -27,7 +27,7 @@ def load_tdk_words() -> set:
         return _TDK_WORDS
     if not os.path.exists(TDK_CACHE_FILE):
-        print("[TurkTokenizer] TDK word list not found — downloading...")
         words = _download_from_hf() or _download_from_tdk()
         if not words:
             _TDK_WORDS = set()
@@ -35,7 +35,7 @@ def load_tdk_words() -> set:
     with open(TDK_CACHE_FILE, encoding="utf-8") as f:
         _TDK_WORDS = {line.strip().lower() for line in f if line.strip()}
-    print(f"[TurkTokenizer] TDK: {len(_TDK_WORDS):,} words loaded ✓")
     return _TDK_WORDS
@@ -51,11 +51,11 @@ def _download_from_hf() -> list[str]:
         with open(TDK_CACHE_FILE, "w", encoding="utf-8") as f:
             f.write("\n".join(words))
-        print(f"[TurkTokenizer] TDK: {len(words):,} words downloaded from HuggingFace ✓")
         return words
     except Exception as exc:  # noqa: BLE001
-        print(f"[TurkTokenizer] HuggingFace download failed: {exc} — trying TDK API...")
         return []
@@ -72,11 +72,11 @@ def _download_from_tdk() -> list[str]:
         with open(TDK_CACHE_FILE, "w", encoding="utf-8") as f:
             f.write("\n".join(words))
-        print(f"[TurkTokenizer] TDK: {len(words):,} words downloaded from TDK API ✓")
         return words
     except Exception as exc:  # noqa: BLE001
-        print(f"[TurkTokenizer] TDK API also failed: {exc}")
         print("  FOREIGN detection will be disabled for this session.")
         return []

 import os
 from pathlib import Path
+_CACHE_DIR = Path.home() / ".cache" / "nedo_turkish_tokenizer"
 _CACHE_DIR.mkdir(parents=True, exist_ok=True)
 TDK_CACHE_FILE = str(_CACHE_DIR / "tdk_words.txt")
 _HF_TDK_URL = (
+    "https://huggingface.co/Ethosoft/NedoTurkishTokenizer/resolve/main"
+    "/nedo_turkish_tokenizer/data/tdk_words.txt"
 )
         return _TDK_WORDS
     if not os.path.exists(TDK_CACHE_FILE):
+        print("[NedoTurkishTokenizer] TDK word list not found — downloading...")
         words = _download_from_hf() or _download_from_tdk()
         if not words:
             _TDK_WORDS = set()
     with open(TDK_CACHE_FILE, encoding="utf-8") as f:
         _TDK_WORDS = {line.strip().lower() for line in f if line.strip()}
+    print(f"[NedoTurkishTokenizer] TDK: {len(_TDK_WORDS):,} words loaded ✓")
     return _TDK_WORDS
         with open(TDK_CACHE_FILE, "w", encoding="utf-8") as f:
             f.write("\n".join(words))
+        print(f"[NedoTurkishTokenizer] TDK: {len(words):,} words downloaded from HuggingFace ✓")
         return words
     except Exception as exc:  # noqa: BLE001
+        print(f"[NedoTurkishTokenizer] HuggingFace download failed: {exc} — trying TDK API...")
         return []
         with open(TDK_CACHE_FILE, "w", encoding="utf-8") as f:
             f.write("\n".join(words))
+        print(f"[NedoTurkishTokenizer] TDK: {len(words):,} words downloaded from TDK API ✓")
         return words
     except Exception as exc:  # noqa: BLE001
+        print(f"[NedoTurkishTokenizer] TDK API also failed: {exc}")
         print("  FOREIGN detection will be disabled for this session.")
         return []

{turk_tokenizer → nedo_turkish_tokenizer}/data/tdk_words.txt RENAMED Viewed

File without changes

{turk_tokenizer → nedo_turkish_tokenizer}/data/turkish_proper_nouns.txt RENAMED Viewed

File without changes

{turk_tokenizer → nedo_turkish_tokenizer}/data/zemberek-full.jar RENAMED Viewed

File without changes

{turk_tokenizer → nedo_turkish_tokenizer}/tokenizer.py RENAMED Viewed

@@ -1,5 +1,5 @@
 """
-TurkTokenizer — production-ready Turkish morphological tokenizer.
 Applies 12 sequential fixes on top of the base turkish-tokenizer:
   1.  ALL CAPS inflation fix
@@ -68,12 +68,12 @@ _TYPE_SYM = {
 # ── Parallel worker helpers ───────────────────────────────────────────────────
-_worker_tok: "TurkTokenizer | None" = None
 def _init_worker() -> None:
     global _worker_tok
-    _worker_tok = TurkTokenizer()
 def _tokenize_one(text: str) -> list[dict]:
@@ -83,15 +83,15 @@ def _tokenize_one(text: str) -> list[dict]:
 # ══════════════════════════════════════════════════════════════════════════════
-class TurkTokenizer:
     """
     Turkish morphological tokenizer with HuggingFace-compatible interface.
     Example::
-        from turk_tokenizer import TurkTokenizer
-        tok = TurkTokenizer()
         tokens = tok("İstanbul'da meeting'e katılamadım")
         for t in tokens:
             print(t["token"], t["token_type"], t["morph_pos"])
@@ -210,14 +210,14 @@ class TurkTokenizer:
                     results[i] = fut.result()
                 except Exception as exc:  # noqa: BLE001
                     results[i] = self._base.tokenize_text(texts[i])
-                    print(f"[TurkTokenizer] fallback at idx={i}: {exc}")
         return results  # type: ignore[return-value]
     # ── HuggingFace-style helpers ─────────────────────────────────────────────
     @classmethod
-    def from_pretrained(cls, _model_id: str = "Ethosoft/turk-tokenizer") -> "TurkTokenizer":
         """Load tokenizer (rules-based, no weights to download)."""
         return cls()
@@ -227,8 +227,8 @@ class TurkTokenizer:
         path = Path(save_directory)
         path.mkdir(parents=True, exist_ok=True)
         config = {
-            "tokenizer_class": "TurkTokenizer",
-            "model_type": "turk-tokenizer",
             "version": "1.0.0",
             "zemberek_available": self.zemberek_available,
         }

 """
+NedoTurkishTokenizer — production-ready Turkish morphological tokenizer.
 Applies 12 sequential fixes on top of the base turkish-tokenizer:
   1.  ALL CAPS inflation fix
 # ── Parallel worker helpers ───────────────────────────────────────────────────
+_worker_tok: "NedoTurkishTokenizer | None" = None
 def _init_worker() -> None:
     global _worker_tok
+    _worker_tok = NedoTurkishTokenizer()
 def _tokenize_one(text: str) -> list[dict]:
 # ══════════════════════════════════════════════════════════════════════════════
+class NedoTurkishTokenizer:
     """
     Turkish morphological tokenizer with HuggingFace-compatible interface.
     Example::
+        from nedo_turkish_tokenizer import NedoTurkishTokenizer
+        tok = NedoTurkishTokenizer()
         tokens = tok("İstanbul'da meeting'e katılamadım")
         for t in tokens:
             print(t["token"], t["token_type"], t["morph_pos"])
                     results[i] = fut.result()
                 except Exception as exc:  # noqa: BLE001
                     results[i] = self._base.tokenize_text(texts[i])
+                    print(f"[NedoTurkishTokenizer] fallback at idx={i}: {exc}")
         return results  # type: ignore[return-value]
     # ── HuggingFace-style helpers ─────────────────────────────────────────────
     @classmethod
+    def from_pretrained(cls, _model_id: str = "Ethosoft/NedoTurkishTokenizer") -> "NedoTurkishTokenizer":
         """Load tokenizer (rules-based, no weights to download)."""
         return cls()
         path = Path(save_directory)
         path.mkdir(parents=True, exist_ok=True)
         config = {
+            "tokenizer_class": "NedoTurkishTokenizer",
+            "model_type": "nedo-turkish-tokenizer",
             "version": "1.0.0",
             "zemberek_available": self.zemberek_available,
         }

pyproject.toml CHANGED Viewed

@@ -3,7 +3,7 @@ requires = ["setuptools>=61", "wheel"]
 build-backend = "setuptools.build_meta"
 [project]
-name = "turk-tokenizer"
 version = "1.0.0"
 description = "Turkish morphological tokenizer — TR-MMLU world record %92"
 readme = "README.md"
@@ -28,12 +28,12 @@ dependencies = [
 dev = ["pytest", "huggingface_hub"]
 [project.urls]
-Homepage = "https://huggingface.co/Ethosoft/turk-tokenizer"
-Repository = "https://huggingface.co/Ethosoft/turk-tokenizer"
 [tool.setuptools.packages.find]
 where = ["."]
-include = ["turk_tokenizer*"]
 [tool.setuptools.package-data]
-turk_tokenizer = ["data/*.jar"]

 build-backend = "setuptools.build_meta"
 [project]
+name = "nedo-turkish-tokenizer"
 version = "1.0.0"
 description = "Turkish morphological tokenizer — TR-MMLU world record %92"
 readme = "README.md"
 dev = ["pytest", "huggingface_hub"]
 [project.urls]
+Homepage = "https://huggingface.co/Ethosoft/NedoTurkishTokenizer"
+Repository = "https://huggingface.co/Ethosoft/NedoTurkishTokenizer"
 [tool.setuptools.packages.find]
 where = ["."]
+include = ["nedo_turkish_tokenizer*"]
 [tool.setuptools.package-data]
+nedo_turkish_tokenizer = ["data/*.jar"]

tokenization_turk.py → tokenization_nedo_turkish.py RENAMED Viewed

@@ -1,10 +1,10 @@
 """
-TurkTokenizer — HuggingFace AutoTokenizer compatible class.
 Usage:
     from transformers import AutoTokenizer
-    tok = AutoTokenizer.from_pretrained("Ethosoft/turk-tokenizer", trust_remote_code=True)
     out = tok("İstanbul'da meeting'e katılamadım")
     out["input_ids"]            # hash-stable int IDs of morphological tokens
@@ -42,7 +42,7 @@ def _stable_hash(s: str) -> int:
     return int(hashlib.md5(s.encode("utf-8")).hexdigest()[:6], 16)
-class TurkTokenizer(PreTrainedTokenizer):
     """
     Turkish morphological tokenizer — HuggingFace compatible.
@@ -62,11 +62,11 @@ class TurkTokenizer(PreTrainedTokenizer):
     def __init__(self, **kwargs: Any) -> None:
         super().__init__(**kwargs)
-        self._morph: "TurkTokenizer_core | None" = None  # lazy init
     def _get_morph(self):
         if self._morph is None:
-            from turk_tokenizer import TurkTokenizer as _Core  # noqa: PLC0415
             self._morph = _Core()
         return self._morph
@@ -160,7 +160,7 @@ class TurkTokenizer(PreTrainedTokenizer):
         return self._tokenize(text)
     def morphological_tokenize(self, text: str) -> list[dict]:
-        """Return full morphological token dicts (main TurkTokenizer output)."""
         return self._get_morph().tokenize(text)
     def batch_tokenize(self, texts: list[str], workers: int | None = None) -> list[list[dict]]:

 """
+NedoTurkishTokenizer — HuggingFace AutoTokenizer compatible class.
 Usage:
     from transformers import AutoTokenizer
+    tok = AutoTokenizer.from_pretrained("Ethosoft/NedoTurkishTokenizer", trust_remote_code=True)
     out = tok("İstanbul'da meeting'e katılamadım")
     out["input_ids"]            # hash-stable int IDs of morphological tokens
     return int(hashlib.md5(s.encode("utf-8")).hexdigest()[:6], 16)
+class NedoTurkishTokenizer(PreTrainedTokenizer):
     """
     Turkish morphological tokenizer — HuggingFace compatible.
     def __init__(self, **kwargs: Any) -> None:
         super().__init__(**kwargs)
+        self._morph: "NedoTurkishTokenizer_core | None" = None  # lazy init
     def _get_morph(self):
         if self._morph is None:
+            from nedo_turkish_tokenizer import NedoTurkishTokenizer as _Core  # noqa: PLC0415
             self._morph = _Core()
         return self._morph
         return self._tokenize(text)
     def morphological_tokenize(self, text: str) -> list[dict]:
+        """Return full morphological token dicts (main NedoTurkishTokenizer output)."""
         return self._get_morph().tokenize(text)
     def batch_tokenize(self, texts: list[str], workers: int | None = None) -> list[list[dict]]:

tokenizer_config.json CHANGED Viewed

@@ -1,8 +1,8 @@
 {
-  "tokenizer_class": "TurkTokenizer",
-  "model_type": "turk-tokenizer",
   "auto_map": {
-    "AutoTokenizer": ["tokenization_turk.TurkTokenizer", null]
   },
   "version": "1.0.0",
   "language": "tr",

 {
+  "tokenizer_class": "NedoTurkishTokenizer",
+  "model_type": "nedo-turkish-tokenizer",
   "auto_map": {
+    "AutoTokenizer": ["tokenization_nedo_turkish.NedoTurkishTokenizer", null]
   },
   "version": "1.0.0",
   "language": "tr",