"""Comprehensive regression test suite for NedoTurkishTokenizer.

Tests the public API and core segmentation with gold-standard examples
covering: basic Turkish, suffix chains, apostrophes, foreign words,
acronyms, special spans, ALL CAPS, compound words, and edge cases.

TOKEN FORMAT CONTRACT:
Token text does NOT include leading whitespace; whether a token is
word-initial is indicated by morph_pos == 0.
"""
from __future__ import annotations
import unittest
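# Illustrative token stream for "evde" under the contract above. The ev + de
# split is asserted in TestBasicTurkish below; the exact dicts are a sketch:
#     {"token": "ev", "token_type": "ROOT",   "morph_pos": 0}
#     {"token": "de", "token_type": "SUFFIX", "morph_pos": 1}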
class TestTokenizerPublicAPI(unittest.TestCase):
"""Smoke tests for the public API surface."""
@classmethod
def setUpClass(cls) -> None:
from nedo_turkish_tokenizer import NedoTurkishTokenizer
cls.tok = NedoTurkishTokenizer()
def test_import_and_instantiate(self) -> None:
from nedo_turkish_tokenizer import NedoTurkishTokenizer
t = NedoTurkishTokenizer()
self.assertIsNotNone(t)
def test_version(self) -> None:
from nedo_turkish_tokenizer import __version__
self.assertEqual(__version__, "2.0.0")
def test_empty_input(self) -> None:
self.assertEqual(self.tok.tokenize(""), [])
self.assertEqual(self.tok.tokenize(" "), [])
def test_callable_shorthand(self) -> None:
result = self.tok("Merhaba")
self.assertTrue(len(result) > 0)
def test_token_dict_fields(self) -> None:
tokens = self.tok.tokenize("ev")
self.assertTrue(len(tokens) >= 1)
t = tokens[0]
self.assertIn("token", t)
self.assertIn("token_type", t)
self.assertIn("morph_pos", t)
def test_batch_tokenize(self) -> None:
texts = ["ev", "araba", "merhaba"]
results = self.tok.batch_tokenize(texts, chunk_size=1000)
self.assertEqual(len(results), 3)
for r in results:
self.assertIsInstance(r, list)
self.assertTrue(len(r) >= 1)
def test_stats(self) -> None:
tokens = self.tok.tokenize("evde oturuyorum")
stats = self.tok.stats(tokens)
self.assertIn("total", stats)
self.assertIn("roots", stats)
self.assertIn("suffixes", stats)
self.assertIn("tr_pct", stats)
self.assertGreater(stats["total"], 0)
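    # Sketch of the assumed stats() return shape. Only the keys checked above
    # are part of the contract; the values here are illustrative:
    #     {"total": 4, "roots": 2, "suffixes": 2, "tr_pct": 100.0}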
class TestTokenFormat(unittest.TestCase):
"""Token text must NOT include leading whitespace."""
@classmethod
def setUpClass(cls) -> None:
from nedo_turkish_tokenizer import NedoTurkishTokenizer
cls.tok = NedoTurkishTokenizer()
def test_no_leading_space_root(self) -> None:
tokens = self.tok.tokenize("merhaba")
self.assertEqual(tokens[0]["token"], "merhaba")
def test_no_leading_space_suffix(self) -> None:
tokens = self.tok.tokenize("evde")
for t in tokens:
self.assertFalse(
t["token"].startswith(" "),
f"Token {t['token']!r} has a leading space",
)
def test_no_leading_space_url(self) -> None:
tokens = self.tok.tokenize("https://example.com")
self.assertEqual(tokens[0]["token"], "https://example.com")
def test_no_leading_space_num(self) -> None:
tokens = self.tok.tokenize("%85")
self.assertEqual(tokens[0]["token"], "%85")
def test_no_leading_space_any_token(self) -> None:
"""No token in the output should ever start with a space."""
text = "İstanbul'da meeting'e katılamadım https://example.com %85"
tokens = self.tok.tokenize(text)
for t in tokens:
self.assertFalse(
t["token"].startswith(" "),
f"Token {t['token']!r} (type={t['token_type']}) has a leading space",
)
class TestBasicTurkish(unittest.TestCase):
"""Core Turkish morphology tokenization."""
@classmethod
def setUpClass(cls) -> None:
from nedo_turkish_tokenizer import NedoTurkishTokenizer
cls.tok = NedoTurkishTokenizer()
def _roots(self, text: str) -> list[str]:
return [t["token"] for t in self.tok.tokenize(text) if t["token_type"] == "ROOT"]
def _types(self, text: str) -> list[str]:
return [t["token_type"] for t in self.tok.tokenize(text)]
def _suffixes(self, text: str) -> list[str]:
return [t["token"] for t in self.tok.tokenize(text) if t["token_type"] == "SUFFIX"]
# ── Single words ─────────────────────────────────────────────────────
def test_simple_root(self) -> None:
tokens = self.tok.tokenize("merhaba")
self.assertEqual(tokens[0]["token"], "merhaba")
self.assertEqual(tokens[0]["token_type"], "ROOT")
def test_whole_word_tdk_preserved(self) -> None:
"""'dünya' is in TDK — must NOT be split into 'dün' + 'ya'."""
roots = self._roots("dünya")
self.assertIn("dünya", roots)
def test_suffix_loc(self) -> None:
tokens = self.tok.tokenize("evde")
self.assertEqual(tokens[0]["token"], "ev")
self.assertEqual(tokens[0]["token_type"], "ROOT")
self.assertEqual(tokens[1]["token"], "de")
self.assertEqual(tokens[1]["token_type"], "SUFFIX")
def test_suffix_plural_acc(self) -> None:
tokens = self.tok.tokenize("kitapları")
self.assertEqual(tokens[0]["token"], "kitap")
types = [t["token_type"] for t in tokens]
self.assertIn("SUFFIX", types)
def test_verb_stem_past(self) -> None:
"""Verb stems derived from infinitives must be found."""
roots = self._roots("geldim")
self.assertIn("gel", roots)
def test_verb_stem_progressive(self) -> None:
roots = self._roots("geliyorum")
self.assertIn("gel", roots)
def test_verb_otur(self) -> None:
roots = self._roots("oturuyorum")
self.assertIn("otur", roots)
def test_katil_root(self) -> None:
roots = self._roots("katılamadım")
self.assertIn("katıl", roots)
def test_longer_root_wins(self) -> None:
"""'toplantısı' should segment as 'toplantı' + 'sı', not 'toplan' + 'tı' + 'sı'."""
roots = self._roots("toplantısı")
self.assertIn("toplantı", roots)
def test_morph_pos_increments(self) -> None:
tokens = self.tok.tokenize("evlerden")
suffix_positions = [t["morph_pos"] for t in tokens if t["token_type"] == "SUFFIX"]
        for i, pos in enumerate(suffix_positions):
            self.assertGreater(pos, 0, f"Suffix at index {i} should have morph_pos > 0")
        # The test name promises increments: positions must not decrease along the chain.
        self.assertEqual(suffix_positions, sorted(suffix_positions))
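# Assumed decomposition of "evlerden" driving the morph_pos check above
# (the suffix identities are a sketch; the test only inspects types/positions):
#     ev (ROOT, morph_pos 0) + ler (SUFFIX, 1) + den (SUFFIX, 2)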
class TestFalseSuffixSplits(unittest.TestCase):
"""Regression tests: common words that must NOT be over-segmented.

    These words look like root+suffix but are standalone units.
    """
@classmethod
def setUpClass(cls) -> None:
from nedo_turkish_tokenizer import NedoTurkishTokenizer
cls.tok = NedoTurkishTokenizer()
def _assert_single_root(self, word: str) -> None:
"""Assert that *word* tokenizes to exactly one ROOT token."""
tokens = self.tok.tokenize(word)
roots = [t for t in tokens if t["token_type"] == "ROOT"]
self.assertEqual(
len(roots), 1,
f"'{word}' should be a single ROOT, got: "
f"{[(t['token'], t['token_type']) for t in tokens]}",
)
self.assertEqual(len(tokens), 1, f"'{word}' should produce 1 token, got {len(tokens)}")
self.assertEqual(tokens[0]["token"], word)
# ── Forms of "demek" (to say) ────────────────────────────────────────
    # The stem of "demek" coincides with the TDK conjunction "de",
    # inviting false splits like de + di.
def test_dedi(self) -> None:
self._assert_single_root("dedi")
def test_dedim(self) -> None:
self._assert_single_root("dedim")
def test_demis(self) -> None:
self._assert_single_root("demiş")
def test_denir(self) -> None:
self._assert_single_root("denir")
def test_dese(self) -> None:
self._assert_single_root("dese")
# ── Discourse particles / conjunctions ───────────────────────────────
# These are in TDK and should be protected by WHOLE_WORD_BONUS.
def test_yani(self) -> None:
self._assert_single_root("yani")
def test_belki(self) -> None:
self._assert_single_root("belki")
def test_cunku(self) -> None:
self._assert_single_root("çünkü")
def test_sanki(self) -> None:
self._assert_single_root("sanki")
# ── "dedi mi" phrase ─────────────────────────────────────────────────
def test_dedi_mi(self) -> None:
tokens = self.tok.tokenize("dedi mi")
roots = [t for t in tokens if t["token_type"] == "ROOT"]
self.assertEqual(len(roots), 2, "Both 'dedi' and 'mi' should be roots")
root_texts = [t["token"] for t in roots]
        self.assertIn("dedi", root_texts)
        self.assertIn("mi", root_texts)
# ── TDK-protected words should never be split ────────────────────────
def test_bile(self) -> None:
self._assert_single_root("bile")
def test_daha(self) -> None:
self._assert_single_root("daha")
class TestApostrophe(unittest.TestCase):
"""Apostrophe handling for Turkish proper names and foreign stems."""
@classmethod
def setUpClass(cls) -> None:
from nedo_turkish_tokenizer import NedoTurkishTokenizer
cls.tok = NedoTurkishTokenizer()
def test_turkish_proper_name(self) -> None:
"""İstanbul'da → ROOT + PUNCT(') + SUFFIX(da)."""
tokens = self.tok.tokenize("İstanbul'da")
types = [t["token_type"] for t in tokens]
self.assertIn("ROOT", types)
self.assertIn("PUNCT", types)
self.assertIn("SUFFIX", types)
def test_foreign_stem(self) -> None:
"""meeting'e → FOREIGN + SUFFIX."""
tokens = self.tok.tokenize("meeting'e")
types = [t["token_type"] for t in tokens]
self.assertIn("FOREIGN", types)
self.assertIn("SUFFIX", types)
def test_apostrophe_suffix_label(self) -> None:
tokens = self.tok.tokenize("İstanbul'da")
suffix_tokens = [t for t in tokens if t["token_type"] == "SUFFIX"]
self.assertTrue(len(suffix_tokens) >= 1)
self.assertEqual(suffix_tokens[0].get("_suffix_label"), "-LOC")
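# Assumed full sequence for "İstanbul'da", consistent with the assertions above
# (lowercasing of the root is not asserted here and may differ):
#     istanbul (ROOT) + ' (PUNCT) + da (SUFFIX, _suffix_label="-LOC")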
class TestSpecialSpans(unittest.TestCase):
    """URL, date, number, mention, hashtag, and acronym detection."""
@classmethod
def setUpClass(cls) -> None:
from nedo_turkish_tokenizer import NedoTurkishTokenizer
cls.tok = NedoTurkishTokenizer()
def _find_type(self, text: str, ttype: str) -> list[dict]:
return [t for t in self.tok.tokenize(text) if t["token_type"] == ttype]
def test_url_detection(self) -> None:
urls = self._find_type("https://example.com sitesine bak", "URL")
self.assertEqual(len(urls), 1)
self.assertIn("example.com", urls[0]["token"])
def test_date_detection(self) -> None:
dates = self._find_type("14.03.2026 tarihinde", "DATE")
self.assertEqual(len(dates), 1)
def test_number_detection(self) -> None:
nums = self._find_type("%85 başarı", "NUM")
self.assertEqual(len(nums), 1)
def test_acronym_detection(self) -> None:
tokens = self.tok.tokenize("NATO güçlü")
acr = [t for t in tokens if t["token_type"] == "ACRONYM"]
self.assertEqual(len(acr), 1)
self.assertTrue(acr[0].get("_expansion"))
def test_mention_detection(self) -> None:
mentions = self._find_type("@kullanici çok iyi", "MENTION")
self.assertEqual(len(mentions), 1)
def test_hashtag_detection(self) -> None:
tags = self._find_type("#türkiye çok güzel", "HASHTAG")
self.assertEqual(len(tags), 1)
class TestAllCaps(unittest.TestCase):
"""ALL CAPS word handling."""
@classmethod
def setUpClass(cls) -> None:
from nedo_turkish_tokenizer import NedoTurkishTokenizer
cls.tok = NedoTurkishTokenizer()
def test_caps_detected(self) -> None:
tokens = self.tok.tokenize("İSTANBUL güzel")
istanbul_tok = [t for t in tokens if "istanbul" in t["token"]]
self.assertTrue(len(istanbul_tok) >= 1)
self.assertTrue(istanbul_tok[0].get("_caps"))
def test_caps_lowered(self) -> None:
tokens = self.tok.tokenize("İSTANBUL")
self.assertEqual(tokens[0]["token"], "istanbul")
def test_caps_acronym(self) -> None:
"""Known acronyms in ALL CAPS should be ACRONYM type."""
tokens = self.tok.tokenize("TBMM toplantısı")
tbmm = [t for t in tokens if t["token_type"] == "ACRONYM"]
self.assertTrue(len(tbmm) >= 1)
class TestCanonicalLabels(unittest.TestCase):
"""Allomorph canonicalization metadata."""
@classmethod
def setUpClass(cls) -> None:
from nedo_turkish_tokenizer import NedoTurkishTokenizer
cls.tok = NedoTurkishTokenizer()
def test_loc_canonical(self) -> None:
tokens = self.tok.tokenize("evde")
suffix = [t for t in tokens if t["token_type"] == "SUFFIX"]
self.assertTrue(any(t.get("_canonical") == "LOC" for t in suffix))
def test_pl_canonical(self) -> None:
tokens = self.tok.tokenize("evler")
suffix = [t for t in tokens if t["token_type"] == "SUFFIX"]
self.assertTrue(any(t.get("_canonical") == "PL" for t in suffix))
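# Allomorph families these canonical labels are assumed to cover (standard
# Turkish vowel/consonant harmony; only LOC and PL are asserted above):
#     LOC: -de / -da / -te / -ta        PL: -ler / -lar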
class TestCompoundAnnotation(unittest.TestCase):
"""Compound word detection."""
@classmethod
def setUpClass(cls) -> None:
from nedo_turkish_tokenizer import NedoTurkishTokenizer
cls.tok = NedoTurkishTokenizer()
def test_known_compound(self) -> None:
tokens = self.tok.tokenize("başbakan")
root = [t for t in tokens if t["token_type"] == "ROOT"]
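        # Guarded assertion: compound metadata is only checked when the word
        # survives as a single un-split root; other segmentations pass vacuously.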
if root and root[0]["token"] == "başbakan":
self.assertTrue(root[0].get("_compound"))
self.assertIn("baş", root[0].get("_parts", []))
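# "başbakan" (prime minister) is assumed to decompose as "baş" (head/chief) +
# "bakan" (minister); only the "baş" constituent is asserted above.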
class TestNoDependencies(unittest.TestCase):
"""Verify no external runtime dependencies are imported."""
def test_no_external_imports(self) -> None:
import ast
from pathlib import Path
pkg_dir = Path(__file__).parent.parent / "nedo_turkish_tokenizer"
banned = {"turkish_tokenizer", "zemberek", "requests", "transformers"}
for py_file in pkg_dir.glob("*.py"):
tree = ast.parse(py_file.read_text(encoding="utf-8"))
for node in ast.walk(tree):
if isinstance(node, ast.Import):
for alias in node.names:
top = alias.name.split(".")[0]
self.assertNotIn(
top, banned,
f"{py_file.name} imports banned dependency: {alias.name}"
)
elif isinstance(node, ast.ImportFrom):
if node.module:
top = node.module.split(".")[0]
self.assertNotIn(
top, banned,
f"{py_file.name} imports banned dependency: {node.module}"
)
class TestEdgeCases(unittest.TestCase):
"""Edge cases and regression guards."""
@classmethod
def setUpClass(cls) -> None:
from nedo_turkish_tokenizer import NedoTurkishTokenizer
cls.tok = NedoTurkishTokenizer()
def test_punctuation_only(self) -> None:
tokens = self.tok.tokenize("...")
self.assertTrue(all(t["token_type"] == "PUNCT" for t in tokens))
def test_mixed_punctuation(self) -> None:
tokens = self.tok.tokenize('"Merhaba," dedi.')
types = [t["token_type"] for t in tokens]
self.assertIn("PUNCT", types)
self.assertIn("ROOT", types)
def test_unicode_normalized(self) -> None:
tokens = self.tok.tokenize(" merhaba dünya ")
roots = [t["token"] for t in tokens if t["token_type"] == "ROOT"]
self.assertIn("merhaba", roots)
self.assertIn("dünya", roots)
def test_single_char_word(self) -> None:
tokens = self.tok.tokenize("a")
self.assertTrue(len(tokens) >= 1)
def test_number_apostrophe_suffix(self) -> None:
"""3'te, 1990'larda should be NUM + SUFFIX."""
tokens = self.tok.tokenize("3'te geldim")
        num = [t for t in tokens if t["token_type"] == "NUM"]
        self.assertTrue(len(num) >= 1)
        self.assertIn("SUFFIX", [t["token_type"] for t in tokens])
    def test_integration_full_sentence(self) -> None:
        """Full integration test with mixed content."""
        result = self.tok.tokenize("İstanbul'da meeting'e katılamadım")
        self.assertIsInstance(result, list)
        self.assertTrue(len(result) > 0)
        # Critical acceptance criteria: every token carries the contract fields.
        self.assertTrue(all("token" in tok and "token_type" in tok for tok in result))
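# Run directly (python tests/test_tokenizer.py) or via the standard unittest
# runner, e.g. python -m unittest discover tests.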
if __name__ == "__main__":
unittest.main()