"""Comprehensive regression test suite for NedoTurkishTokenizer.
Tests the public API and core segmentation with gold-standard examples
covering: basic Turkish, suffix chains, apostrophes, foreign words,
acronyms, special spans, ALL CAPS, compound words, and edge cases.
TOKEN FORMAT CONTRACT:
token text does NOT include leading whitespace.
Whether a token is word-initial is indicated by morph_pos == 0.
"""
from __future__ import annotations
import unittest
class TestTokenizerPublicAPI(unittest.TestCase):
    """Smoke tests for the public API surface."""

    @classmethod
    def setUpClass(cls) -> None:
        from nedo_turkish_tokenizer import NedoTurkishTokenizer
        cls.tok = NedoTurkishTokenizer()

    def test_import_and_instantiate(self) -> None:
        from nedo_turkish_tokenizer import NedoTurkishTokenizer
        instance = NedoTurkishTokenizer()
        self.assertIsNotNone(instance)

    def test_version(self) -> None:
        from nedo_turkish_tokenizer import __version__
        self.assertEqual(__version__, "2.0.0")

    def test_empty_input(self) -> None:
        # Both truly empty and whitespace-only input yield no tokens.
        self.assertEqual(self.tok.tokenize(""), [])
        self.assertEqual(self.tok.tokenize(" "), [])

    def test_callable_shorthand(self) -> None:
        # The tokenizer instance itself is callable.
        produced = self.tok("Merhaba")
        self.assertTrue(len(produced) > 0)

    def test_token_dict_fields(self) -> None:
        produced = self.tok.tokenize("ev")
        self.assertTrue(len(produced) >= 1)
        first = produced[0]
        for field in ("token", "token_type", "morph_pos"):
            self.assertIn(field, first)

    def test_batch_tokenize(self) -> None:
        batch = self.tok.batch_tokenize(["ev", "araba", "merhaba"], chunk_size=1000)
        self.assertEqual(len(batch), 3)
        for item in batch:
            self.assertIsInstance(item, list)
            self.assertTrue(len(item) >= 1)

    def test_stats(self) -> None:
        summary = self.tok.stats(self.tok.tokenize("evde oturuyorum"))
        for key in ("total", "roots", "suffixes", "tr_pct"):
            self.assertIn(key, summary)
        self.assertGreater(summary["total"], 0)
class TestTokenFormat(unittest.TestCase):
    """Token text must NOT include leading whitespace."""

    @classmethod
    def setUpClass(cls) -> None:
        from nedo_turkish_tokenizer import NedoTurkishTokenizer
        cls.tok = NedoTurkishTokenizer()

    def test_no_leading_space_root(self) -> None:
        self.assertEqual(self.tok.tokenize("merhaba")[0]["token"], "merhaba")

    def test_no_leading_space_suffix(self) -> None:
        for piece in self.tok.tokenize("evde"):
            self.assertFalse(
                piece["token"].startswith(" "),
                f"Token {piece['token']!r} has a leading space",
            )

    def test_no_leading_space_url(self) -> None:
        self.assertEqual(
            self.tok.tokenize("https://example.com")[0]["token"],
            "https://example.com",
        )

    def test_no_leading_space_num(self) -> None:
        self.assertEqual(self.tok.tokenize("%85")[0]["token"], "%85")

    def test_no_leading_space_any_token(self) -> None:
        """No token in the output should ever start with a space."""
        sample = "İstanbul'da meeting'e katılamadım https://example.com %85"
        for piece in self.tok.tokenize(sample):
            self.assertFalse(
                piece["token"].startswith(" "),
                f"Token {piece['token']!r} (type={piece['token_type']}) has a leading space",
            )
class TestBasicTurkish(unittest.TestCase):
    """Core Turkish morphology tokenization."""

    @classmethod
    def setUpClass(cls) -> None:
        from nedo_turkish_tokenizer import NedoTurkishTokenizer
        cls.tok = NedoTurkishTokenizer()

    def _roots(self, text: str) -> list[str]:
        pieces = self.tok.tokenize(text)
        return [p["token"] for p in pieces if p["token_type"] == "ROOT"]

    def _types(self, text: str) -> list[str]:
        pieces = self.tok.tokenize(text)
        return [p["token_type"] for p in pieces]

    def _suffixes(self, text: str) -> list[str]:
        pieces = self.tok.tokenize(text)
        return [p["token"] for p in pieces if p["token_type"] == "SUFFIX"]

    # ── Single words ─────────────────────────────────────────────────────
    def test_simple_root(self) -> None:
        first = self.tok.tokenize("merhaba")[0]
        self.assertEqual(first["token"], "merhaba")
        self.assertEqual(first["token_type"], "ROOT")

    def test_whole_word_tdk_preserved(self) -> None:
        """'dünya' is in TDK — must NOT be split into 'dün' + 'ya'."""
        self.assertIn("dünya", self._roots("dünya"))

    def test_suffix_loc(self) -> None:
        pieces = self.tok.tokenize("evde")
        self.assertEqual(pieces[0]["token"], "ev")
        self.assertEqual(pieces[0]["token_type"], "ROOT")
        self.assertEqual(pieces[1]["token"], "de")
        self.assertEqual(pieces[1]["token_type"], "SUFFIX")

    def test_suffix_plural_acc(self) -> None:
        pieces = self.tok.tokenize("kitapları")
        self.assertEqual(pieces[0]["token"], "kitap")
        self.assertIn("SUFFIX", [p["token_type"] for p in pieces])

    def test_verb_stem_past(self) -> None:
        """Verb stems derived from infinitives must be found."""
        self.assertIn("gel", self._roots("geldim"))

    def test_verb_stem_progressive(self) -> None:
        self.assertIn("gel", self._roots("geliyorum"))

    def test_verb_otur(self) -> None:
        self.assertIn("otur", self._roots("oturuyorum"))

    def test_katil_root(self) -> None:
        self.assertIn("katıl", self._roots("katılamadım"))

    def test_longer_root_wins(self) -> None:
        """'toplantısı' should segment as 'toplantı' + 'sı', not 'toplan' + 'tı' + 'sı'."""
        self.assertIn("toplantı", self._roots("toplantısı"))

    def test_morph_pos_increments(self) -> None:
        pieces = self.tok.tokenize("evlerden")
        positions = [p["morph_pos"] for p in pieces if p["token_type"] == "SUFFIX"]
        for idx, pos in enumerate(positions):
            self.assertGreater(pos, 0, f"Suffix at index {idx} should have morph_pos > 0")
class TestFalseSuffixSplits(unittest.TestCase):
    """Regression tests: common words that must NOT be over-segmented.

    These words look like root+suffix but are standalone units.
    """

    @classmethod
    def setUpClass(cls) -> None:
        from nedo_turkish_tokenizer import NedoTurkishTokenizer
        cls.tok = NedoTurkishTokenizer()

    def _assert_single_root(self, word: str) -> None:
        """Assert that *word* tokenizes to exactly one ROOT token."""
        tokens = self.tok.tokenize(word)
        roots = [piece for piece in tokens if piece["token_type"] == "ROOT"]
        self.assertEqual(
            len(roots), 1,
            f"'{word}' should be a single ROOT, got: "
            f"{[(t['token'], t['token_type']) for t in tokens]}",
        )
        self.assertEqual(len(tokens), 1, f"'{word}' should produce 1 token, got {len(tokens)}")
        self.assertEqual(tokens[0]["token"], word)

    # ── Forms of "demek" (to say) ────────────────────────────────────────
    # Stem "de" is a TDK conjunction, causing false splits like de+di.
    def test_dedi(self) -> None:
        self._assert_single_root("dedi")

    def test_dedim(self) -> None:
        self._assert_single_root("dedim")

    def test_demis(self) -> None:
        self._assert_single_root("demiş")

    def test_denir(self) -> None:
        self._assert_single_root("denir")

    def test_dese(self) -> None:
        self._assert_single_root("dese")

    # ── Discourse particles / conjunctions ───────────────────────────────
    # These are in TDK and should be protected by WHOLE_WORD_BONUS.
    def test_yani(self) -> None:
        self._assert_single_root("yani")

    def test_belki(self) -> None:
        self._assert_single_root("belki")

    def test_cunku(self) -> None:
        self._assert_single_root("çünkü")

    def test_sanki(self) -> None:
        self._assert_single_root("sanki")

    # ── "dedi mi" phrase ─────────────────────────────────────────────────
    def test_dedi_mi(self) -> None:
        pieces = self.tok.tokenize("dedi mi")
        root_tokens = [p for p in pieces if p["token_type"] == "ROOT"]
        self.assertEqual(len(root_tokens), 2, "Both 'dedi' and 'mi' should be roots")
        self.assertIn("dedi", [p["token"] for p in root_tokens])

    # ── TDK-protected words should never be split ────────────────────────
    def test_bile(self) -> None:
        self._assert_single_root("bile")

    def test_daha(self) -> None:
        self._assert_single_root("daha")
class TestApostrophe(unittest.TestCase):
    """Apostrophe handling for Turkish proper names and foreign stems."""

    @classmethod
    def setUpClass(cls) -> None:
        from nedo_turkish_tokenizer import NedoTurkishTokenizer
        cls.tok = NedoTurkishTokenizer()

    def test_turkish_proper_name(self) -> None:
        """İstanbul'da → ROOT + PUNCT(') + SUFFIX(da)."""
        seen = [piece["token_type"] for piece in self.tok.tokenize("İstanbul'da")]
        for expected in ("ROOT", "PUNCT", "SUFFIX"):
            self.assertIn(expected, seen)

    def test_foreign_stem(self) -> None:
        """meeting'e → FOREIGN + SUFFIX."""
        seen = [piece["token_type"] for piece in self.tok.tokenize("meeting'e")]
        self.assertIn("FOREIGN", seen)
        self.assertIn("SUFFIX", seen)

    def test_apostrophe_suffix_label(self) -> None:
        # The suffix after the apostrophe carries its canonical label metadata.
        suffixes = [piece for piece in self.tok.tokenize("İstanbul'da")
                    if piece["token_type"] == "SUFFIX"]
        self.assertTrue(len(suffixes) >= 1)
        self.assertEqual(suffixes[0].get("_suffix_label"), "-LOC")
class TestSpecialSpans(unittest.TestCase):
    """URL, date, number, acronym, emoji detection."""

    @classmethod
    def setUpClass(cls) -> None:
        from nedo_turkish_tokenizer import NedoTurkishTokenizer
        cls.tok = NedoTurkishTokenizer()

    def _find_type(self, text: str, ttype: str) -> list[dict]:
        pieces = self.tok.tokenize(text)
        return [piece for piece in pieces if piece["token_type"] == ttype]

    def test_url_detection(self) -> None:
        found = self._find_type("https://example.com sitesine bak", "URL")
        self.assertEqual(len(found), 1)
        self.assertIn("example.com", found[0]["token"])

    def test_date_detection(self) -> None:
        self.assertEqual(len(self._find_type("14.03.2026 tarihinde", "DATE")), 1)

    def test_number_detection(self) -> None:
        self.assertEqual(len(self._find_type("%85 başarı", "NUM")), 1)

    def test_acronym_detection(self) -> None:
        found = self._find_type("NATO güçlü", "ACRONYM")
        self.assertEqual(len(found), 1)
        # Known acronyms carry an expansion annotation.
        self.assertTrue(found[0].get("_expansion"))

    def test_mention_detection(self) -> None:
        self.assertEqual(len(self._find_type("@kullanici çok iyi", "MENTION")), 1)

    def test_hashtag_detection(self) -> None:
        self.assertEqual(len(self._find_type("#türkiye çok güzel", "HASHTAG")), 1)
class TestAllCaps(unittest.TestCase):
    """ALL CAPS word handling."""

    @classmethod
    def setUpClass(cls) -> None:
        from nedo_turkish_tokenizer import NedoTurkishTokenizer
        cls.tok = NedoTurkishTokenizer()

    def test_caps_detected(self) -> None:
        matches = [piece for piece in self.tok.tokenize("İSTANBUL güzel")
                   if "istanbul" in piece["token"]]
        self.assertTrue(len(matches) >= 1)
        # The caps flag records that the source word was uppercase.
        self.assertTrue(matches[0].get("_caps"))

    def test_caps_lowered(self) -> None:
        # Token text is normalized to lowercase.
        self.assertEqual(self.tok.tokenize("İSTANBUL")[0]["token"], "istanbul")

    def test_caps_acronym(self) -> None:
        """Known acronyms in ALL CAPS should be ACRONYM type."""
        found = [piece for piece in self.tok.tokenize("TBMM toplantısı")
                 if piece["token_type"] == "ACRONYM"]
        self.assertTrue(len(found) >= 1)
class TestCanonicalLabels(unittest.TestCase):
    """Allomorph canonicalization metadata."""

    @classmethod
    def setUpClass(cls) -> None:
        from nedo_turkish_tokenizer import NedoTurkishTokenizer
        cls.tok = NedoTurkishTokenizer()

    def _suffix_canonicals(self, text: str) -> list:
        """Collect the _canonical annotations of all SUFFIX tokens in *text*."""
        return [piece.get("_canonical") for piece in self.tok.tokenize(text)
                if piece["token_type"] == "SUFFIX"]

    def test_loc_canonical(self) -> None:
        self.assertTrue("LOC" in self._suffix_canonicals("evde"))

    def test_pl_canonical(self) -> None:
        self.assertTrue("PL" in self._suffix_canonicals("evler"))
class TestCompoundAnnotation(unittest.TestCase):
    """Compound word detection."""

    @classmethod
    def setUpClass(cls) -> None:
        from nedo_turkish_tokenizer import NedoTurkishTokenizer
        cls.tok = NedoTurkishTokenizer()

    def test_known_compound(self) -> None:
        """'başbakan' kept as a whole root must carry compound metadata.

        The previous version wrapped every assertion in a plain ``if``, so
        the test passed vacuously when no ROOT came back or when the root
        text differed.  A missing ROOT is now a hard failure; a segmented
        root becomes an explicit, visible skip instead of a silent pass.
        """
        tokens = self.tok.tokenize("başbakan")
        root = [t for t in tokens if t["token_type"] == "ROOT"]
        self.assertTrue(root, "Expected at least one ROOT token for 'başbakan'")
        if root[0]["token"] != "başbakan":
            # Tokenizer split the word: compound metadata cannot apply, but
            # report it as a skip so the condition is visible in test output.
            self.skipTest("'başbakan' was segmented; compound annotation not applicable")
        self.assertTrue(root[0].get("_compound"))
        self.assertIn("baş", root[0].get("_parts", []))
class TestNoDependencies(unittest.TestCase):
    """Verify no external runtime dependencies are imported."""

    def test_no_external_imports(self) -> None:
        """Statically scan every package module and reject banned imports.

        Fix over the previous version: if the package directory were missing
        or contained no ``*.py`` files, the loop body never ran and the test
        passed vacuously — now an empty scan is a failure.  The Import /
        ImportFrom branches are also merged so the top-level-name check
        exists in exactly one place.
        """
        import ast
        from pathlib import Path

        pkg_dir = Path(__file__).parent.parent / "nedo_turkish_tokenizer"
        banned = {"turkish_tokenizer", "zemberek", "requests", "transformers"}
        py_files = sorted(pkg_dir.glob("*.py"))
        self.assertTrue(py_files, f"No Python files found to scan in {pkg_dir}")
        for py_file in py_files:
            tree = ast.parse(py_file.read_text(encoding="utf-8"))
            for node in ast.walk(tree):
                # Normalize both import forms to a list of dotted module names.
                if isinstance(node, ast.Import):
                    modules = [alias.name for alias in node.names]
                elif isinstance(node, ast.ImportFrom) and node.module:
                    modules = [node.module]
                else:
                    continue
                for module in modules:
                    self.assertNotIn(
                        module.split(".")[0], banned,
                        f"{py_file.name} imports banned dependency: {module}"
                    )
class TestEdgeCases(unittest.TestCase):
    """Edge cases and regression guards."""

    @classmethod
    def setUpClass(cls) -> None:
        from nedo_turkish_tokenizer import NedoTurkishTokenizer
        cls.tok = NedoTurkishTokenizer()

    def test_punctuation_only(self) -> None:
        tokens = self.tok.tokenize("...")
        # Guard against a vacuous pass: all() over an empty list is True.
        self.assertTrue(tokens, "Expected at least one token for '...'")
        self.assertTrue(all(t["token_type"] == "PUNCT" for t in tokens))

    def test_mixed_punctuation(self) -> None:
        types = [t["token_type"] for t in self.tok.tokenize('"Merhaba," dedi.')]
        self.assertIn("PUNCT", types)
        self.assertIn("ROOT", types)

    def test_unicode_normalized(self) -> None:
        tokens = self.tok.tokenize(" merhaba dünya ")
        roots = [t["token"] for t in tokens if t["token_type"] == "ROOT"]
        self.assertIn("merhaba", roots)
        self.assertIn("dünya", roots)

    def test_single_char_word(self) -> None:
        self.assertTrue(len(self.tok.tokenize("a")) >= 1)

    def test_number_apostrophe_suffix(self) -> None:
        """3'te, 1990'larda should be NUM + SUFFIX."""
        nums = [t for t in self.tok.tokenize("3'te geldim") if t["token_type"] == "NUM"]
        self.assertTrue(len(nums) >= 1)

    def test_integration_full_sentence(self) -> None:
        """Full integration test with mixed content.

        Fix over the previous version: it constructed a second
        NedoTurkishTokenizer and tokenized the identical sentence twice,
        doubling the cost without adding coverage — all acceptance criteria
        are now checked against one result from the shared instance.
        """
        result = self.tok.tokenize("İstanbul'da meeting'e katılamadım")
        self.assertTrue(len(result) > 0)
        self.assertIsInstance(result, list)
        self.assertTrue(all("token" in tok and "token_type" in tok for tok in result))
# Allow running the suite directly (`python <this file>`) without a test runner.
if __name__ == "__main__":
    unittest.main()