"""Comprehensive regression test suite for NedoTurkishTokenizer.

Tests the public API and core segmentation with gold-standard examples
covering: basic Turkish, suffix chains, apostrophes, foreign words,
acronyms, special spans, ALL CAPS, compound words, and edge cases.

TOKEN FORMAT CONTRACT:
    token text does NOT include leading whitespace.
    Whether a token is word-initial is indicated by morph_pos == 0.
"""

from __future__ import annotations

import unittest


class TestTokenizerPublicAPI(unittest.TestCase):
    """Smoke tests for the public API surface."""

    @classmethod
    def setUpClass(cls) -> None:
        from nedo_turkish_tokenizer import NedoTurkishTokenizer
        cls.tok = NedoTurkishTokenizer()

    def test_import_and_instantiate(self) -> None:
        from nedo_turkish_tokenizer import NedoTurkishTokenizer
        t = NedoTurkishTokenizer()
        self.assertIsNotNone(t)

    def test_version(self) -> None:
        from nedo_turkish_tokenizer import __version__
        self.assertEqual(__version__, "2.0.0")

    def test_empty_input(self) -> None:
        self.assertEqual(self.tok.tokenize(""), [])
        self.assertEqual(self.tok.tokenize("   "), [])

    def test_callable_shorthand(self) -> None:
        result = self.tok("Merhaba")
        self.assertTrue(len(result) > 0)

    def test_token_dict_fields(self) -> None:
        tokens = self.tok.tokenize("ev")
        self.assertTrue(len(tokens) >= 1)
        t = tokens[0]
        self.assertIn("token", t)
        self.assertIn("token_type", t)
        self.assertIn("morph_pos", t)

    def test_batch_tokenize(self) -> None:
        texts = ["ev", "araba", "merhaba"]
        results = self.tok.batch_tokenize(texts, chunk_size=1000)
        self.assertEqual(len(results), 3)
        for r in results:
            self.assertIsInstance(r, list)
            self.assertTrue(len(r) >= 1)

    def test_stats(self) -> None:
        tokens = self.tok.tokenize("evde oturuyorum")
        stats = self.tok.stats(tokens)
        self.assertIn("total", stats)
        self.assertIn("roots", stats)
        self.assertIn("suffixes", stats)
        self.assertIn("tr_pct", stats)
        self.assertGreater(stats["total"], 0)


class TestTokenFormat(unittest.TestCase):
    """Token text must NOT include leading whitespace."""

    @classmethod
    def setUpClass(cls) -> None:
        from nedo_turkish_tokenizer import NedoTurkishTokenizer
        cls.tok = NedoTurkishTokenizer()

    def test_no_leading_space_root(self) -> None:
        tokens = self.tok.tokenize("merhaba")
        self.assertEqual(tokens[0]["token"], "merhaba")

    def test_no_leading_space_suffix(self) -> None:
        tokens = self.tok.tokenize("evde")
        for t in tokens:
            self.assertFalse(
                t["token"].startswith(" "),
                f"Token {t['token']!r} has a leading space",
            )

    def test_no_leading_space_url(self) -> None:
        tokens = self.tok.tokenize("https://example.com")
        self.assertEqual(tokens[0]["token"], "https://example.com")

    def test_no_leading_space_num(self) -> None:
        tokens = self.tok.tokenize("%85")
        self.assertEqual(tokens[0]["token"], "%85")

    def test_no_leading_space_any_token(self) -> None:
        """No token in the output should ever start with a space."""
        text = "İstanbul'da meeting'e katılamadım https://example.com %85"
        tokens = self.tok.tokenize(text)
        for t in tokens:
            self.assertFalse(
                t["token"].startswith(" "),
                f"Token {t['token']!r} (type={t['token_type']}) has a leading space",
            )


class TestBasicTurkish(unittest.TestCase):
    """Core Turkish morphology tokenization."""

    @classmethod
    def setUpClass(cls) -> None:
        from nedo_turkish_tokenizer import NedoTurkishTokenizer
        cls.tok = NedoTurkishTokenizer()

    def _roots(self, text: str) -> list[str]:
        return [t["token"] for t in self.tok.tokenize(text) if t["token_type"] == "ROOT"]

    def _types(self, text: str) -> list[str]:
        return [t["token_type"] for t in self.tok.tokenize(text)]

    def _suffixes(self, text: str) -> list[str]:
        return [t["token"] for t in self.tok.tokenize(text) if t["token_type"] == "SUFFIX"]

    # ── Single words ─────────────────────────────────────────────────────

    def test_simple_root(self) -> None:
        tokens = self.tok.tokenize("merhaba")
        self.assertEqual(tokens[0]["token"], "merhaba")
        self.assertEqual(tokens[0]["token_type"], "ROOT")

    def test_whole_word_tdk_preserved(self) -> None:
        """'dünya' is in TDK — must NOT be split into 'dün' + 'ya'."""
        roots = self._roots("dünya")
        self.assertIn("dünya", roots)

    def test_suffix_loc(self) -> None:
        tokens = self.tok.tokenize("evde")
        self.assertEqual(tokens[0]["token"], "ev")
        self.assertEqual(tokens[0]["token_type"], "ROOT")
        self.assertEqual(tokens[1]["token"], "de")
        self.assertEqual(tokens[1]["token_type"], "SUFFIX")

    def test_suffix_plural_acc(self) -> None:
        tokens = self.tok.tokenize("kitapları")
        self.assertEqual(tokens[0]["token"], "kitap")
        types = [t["token_type"] for t in tokens]
        self.assertIn("SUFFIX", types)

    def test_verb_stem_past(self) -> None:
        """Verb stems derived from infinitives must be found."""
        roots = self._roots("geldim")
        self.assertIn("gel", roots)

    def test_verb_stem_progressive(self) -> None:
        roots = self._roots("geliyorum")
        self.assertIn("gel", roots)

    def test_verb_otur(self) -> None:
        roots = self._roots("oturuyorum")
        self.assertIn("otur", roots)

    def test_katil_root(self) -> None:
        roots = self._roots("katılamadım")
        self.assertIn("katıl", roots)

    def test_longer_root_wins(self) -> None:
        """'toplantısı' should segment as 'toplantı' + 'sı', not 'toplan' + 'tı' + 'sı'."""
        roots = self._roots("toplantısı")
        self.assertIn("toplantı", roots)

    def test_morph_pos_increments(self) -> None:
        """Suffixes are never word-initial, so every suffix needs morph_pos > 0."""
        tokens = self.tok.tokenize("evlerden")
        suffix_positions = [t["morph_pos"] for t in tokens if t["token_type"] == "SUFFIX"]
        self.assertTrue(suffix_positions, "'evlerden' should yield at least one SUFFIX")
        for i, pos in enumerate(suffix_positions):
            self.assertGreater(pos, 0, f"Suffix at index {i} should have morph_pos > 0")


class TestFalseSuffixSplits(unittest.TestCase):
    """Regression tests: common words that must NOT be over-segmented.

    These words look like root+suffix but are standalone units.
    """

    @classmethod
    def setUpClass(cls) -> None:
        from nedo_turkish_tokenizer import NedoTurkishTokenizer
        cls.tok = NedoTurkishTokenizer()

    def _assert_single_root(self, word: str) -> None:
        """Assert that *word* tokenizes to exactly one ROOT token."""
        tokens = self.tok.tokenize(word)
        roots = [t for t in tokens if t["token_type"] == "ROOT"]
        self.assertEqual(
            len(roots), 1,
            f"'{word}' should be a single ROOT, got: "
            f"{[(t['token'], t['token_type']) for t in tokens]}",
        )
        self.assertEqual(len(tokens), 1, f"'{word}' should produce 1 token, got {len(tokens)}")
        self.assertEqual(tokens[0]["token"], word)

    # ── Forms of "demek" (to say) ────────────────────────────────────────
    # Stem "de" is a TDK conjunction, causing false splits like de+di.

    def test_dedi(self) -> None:
        self._assert_single_root("dedi")

    def test_dedim(self) -> None:
        self._assert_single_root("dedim")

    def test_demis(self) -> None:
        self._assert_single_root("demiş")

    def test_denir(self) -> None:
        self._assert_single_root("denir")

    def test_dese(self) -> None:
        self._assert_single_root("dese")

    # ── Discourse particles / conjunctions ───────────────────────────────
    # These are in TDK and should be protected by WHOLE_WORD_BONUS.
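    # Each superficially parses as stem + particle (e.g. "sanki" as "san" + "ki",
    # "belki" as "bel" + "ki"), which is exactly the split guarded against here.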

    def test_yani(self) -> None:
        self._assert_single_root("yani")

    def test_belki(self) -> None:
        self._assert_single_root("belki")

    def test_cunku(self) -> None:
        self._assert_single_root("çünkü")

    def test_sanki(self) -> None:
        self._assert_single_root("sanki")

    # ── "dedi mi" phrase ─────────────────────────────────────────────────

    def test_dedi_mi(self) -> None:
        tokens = self.tok.tokenize("dedi mi")
        roots = [t for t in tokens if t["token_type"] == "ROOT"]
        self.assertEqual(len(roots), 2, "Both 'dedi' and 'mi' should be roots")
        root_texts = [t["token"] for t in roots]
        self.assertIn("dedi", root_texts)

    # ── TDK-protected words should never be split ────────────────────────

    def test_bile(self) -> None:
        self._assert_single_root("bile")

    def test_daha(self) -> None:
        self._assert_single_root("daha")


class TestApostrophe(unittest.TestCase):
    """Apostrophe handling for Turkish proper names and foreign stems."""

    @classmethod
    def setUpClass(cls) -> None:
        from nedo_turkish_tokenizer import NedoTurkishTokenizer
        cls.tok = NedoTurkishTokenizer()

    def test_turkish_proper_name(self) -> None:
        """İstanbul'da → ROOT + PUNCT(') + SUFFIX(da)."""
        tokens = self.tok.tokenize("İstanbul'da")
        types = [t["token_type"] for t in tokens]
        self.assertIn("ROOT", types)
        self.assertIn("PUNCT", types)
        self.assertIn("SUFFIX", types)

    def test_foreign_stem(self) -> None:
        """meeting'e → FOREIGN + SUFFIX."""
        tokens = self.tok.tokenize("meeting'e")
        types = [t["token_type"] for t in tokens]
        self.assertIn("FOREIGN", types)
        self.assertIn("SUFFIX", types)

    def test_apostrophe_suffix_label(self) -> None:
        tokens = self.tok.tokenize("İstanbul'da")
        suffix_tokens = [t for t in tokens if t["token_type"] == "SUFFIX"]
        self.assertTrue(len(suffix_tokens) >= 1)
        self.assertEqual(suffix_tokens[0].get("_suffix_label"), "-LOC")


class TestSpecialSpans(unittest.TestCase):
    """URL, date, number, acronym, emoji detection."""

    @classmethod
    def setUpClass(cls) -> None:
        from nedo_turkish_tokenizer import NedoTurkishTokenizer
        cls.tok = NedoTurkishTokenizer()

    def _find_type(self, text: str, ttype: str) -> list[dict]:
        return [t for t in self.tok.tokenize(text) if t["token_type"] == ttype]

    def test_url_detection(self) -> None:
        urls = self._find_type("https://example.com sitesine bak", "URL")
        self.assertEqual(len(urls), 1)
        self.assertIn("example.com", urls[0]["token"])

    def test_date_detection(self) -> None:
        dates = self._find_type("14.03.2026 tarihinde", "DATE")
        self.assertEqual(len(dates), 1)

    def test_number_detection(self) -> None:
        nums = self._find_type("%85 başarı", "NUM")
        self.assertEqual(len(nums), 1)

    def test_acronym_detection(self) -> None:
        tokens = self.tok.tokenize("NATO güçlü")
        acr = [t for t in tokens if t["token_type"] == "ACRONYM"]
        self.assertEqual(len(acr), 1)
        self.assertTrue(acr[0].get("_expansion"))

    def test_mention_detection(self) -> None:
        mentions = self._find_type("@kullanici çok iyi", "MENTION")
        self.assertEqual(len(mentions), 1)

    def test_hashtag_detection(self) -> None:
        tags = self._find_type("#türkiye çok güzel", "HASHTAG")
        self.assertEqual(len(tags), 1)


class TestAllCaps(unittest.TestCase):
    """ALL CAPS word handling."""

    @classmethod
    def setUpClass(cls) -> None:
        from nedo_turkish_tokenizer import NedoTurkishTokenizer
        cls.tok = NedoTurkishTokenizer()

    def test_caps_detected(self) -> None:
        tokens = self.tok.tokenize("İSTANBUL güzel")
        istanbul_tok = [t for t in tokens if "istanbul" in t["token"]]
        self.assertTrue(len(istanbul_tok) >= 1)
        self.assertTrue(istanbul_tok[0].get("_caps"))

    def test_caps_lowered(self) -> None:
        tokens = self.tok.tokenize("İSTANBUL")
        self.assertEqual(tokens[0]["token"], "istanbul")

    def test_caps_acronym(self) -> None:
        """Known acronyms in ALL CAPS should be ACRONYM type."""
        tokens = self.tok.tokenize("TBMM toplantısı")
        tbmm = [t for t in tokens if t["token_type"] == "ACRONYM"]
        self.assertTrue(len(tbmm) >= 1)


class TestCanonicalLabels(unittest.TestCase):
    """Allomorph canonicalization metadata."""

    @classmethod
    def setUpClass(cls) -> None:
        from nedo_turkish_tokenizer import NedoTurkishTokenizer
        cls.tok = NedoTurkishTokenizer()

    def test_loc_canonical(self) -> None:
        tokens = self.tok.tokenize("evde")
        suffix = [t for t in tokens if t["token_type"] == "SUFFIX"]
        self.assertTrue(any(t.get("_canonical") == "LOC" for t in suffix))

    def test_pl_canonical(self) -> None:
        tokens = self.tok.tokenize("evler")
        suffix = [t for t in tokens if t["token_type"] == "SUFFIX"]
        self.assertTrue(any(t.get("_canonical") == "PL" for t in suffix))


class TestCompoundAnnotation(unittest.TestCase):
    """Compound word detection."""

    @classmethod
    def setUpClass(cls) -> None:
        from nedo_turkish_tokenizer import NedoTurkishTokenizer
        cls.tok = NedoTurkishTokenizer()

    def test_known_compound(self) -> None:
        tokens = self.tok.tokenize("başbakan")
        root = [t for t in tokens if t["token_type"] == "ROOT"]
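        # The check is intentionally conditional: compound metadata is only
        # asserted when the tokenizer keeps "başbakan" as a single root.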
        if root and root[0]["token"] == "başbakan":
            self.assertTrue(root[0].get("_compound"))
            self.assertIn("baş", root[0].get("_parts", []))


class TestNoDependencies(unittest.TestCase):
    """Verify no external runtime dependencies are imported."""

    def test_no_external_imports(self) -> None:
        import ast
        from pathlib import Path

        pkg_dir = Path(__file__).parent.parent / "nedo_turkish_tokenizer"
        banned = {"turkish_tokenizer", "zemberek", "requests", "transformers"}

        for py_file in pkg_dir.glob("*.py"):
            tree = ast.parse(py_file.read_text(encoding="utf-8"))
            for node in ast.walk(tree):
                if isinstance(node, ast.Import):
                    for alias in node.names:
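                        # Compare only the top-level package name
                        # (e.g. "foo.bar" resolves to "foo").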
                        top = alias.name.split(".")[0]
                        self.assertNotIn(
                            top, banned,
                            f"{py_file.name} imports banned dependency: {alias.name}"
                        )
                elif isinstance(node, ast.ImportFrom):
                    if node.module:
                        top = node.module.split(".")[0]
                        self.assertNotIn(
                            top, banned,
                            f"{py_file.name} imports banned dependency: {node.module}"
                        )


class TestEdgeCases(unittest.TestCase):
    """Edge cases and regression guards."""

    @classmethod
    def setUpClass(cls) -> None:
        from nedo_turkish_tokenizer import NedoTurkishTokenizer
        cls.tok = NedoTurkishTokenizer()

    def test_punctuation_only(self) -> None:
        tokens = self.tok.tokenize("...")
        self.assertTrue(all(t["token_type"] == "PUNCT" for t in tokens))

    def test_mixed_punctuation(self) -> None:
        tokens = self.tok.tokenize('"Merhaba," dedi.')
        types = [t["token_type"] for t in tokens]
        self.assertIn("PUNCT", types)
        self.assertIn("ROOT", types)

    def test_whitespace_normalized(self) -> None:
        """Leading, trailing, and repeated internal whitespace is ignored."""
        tokens = self.tok.tokenize("  merhaba   dünya  ")
        roots = [t["token"] for t in tokens if t["token_type"] == "ROOT"]
        self.assertIn("merhaba", roots)
        self.assertIn("dünya", roots)

    def test_single_char_word(self) -> None:
        tokens = self.tok.tokenize("a")
        self.assertTrue(len(tokens) >= 1)

    def test_number_apostrophe_suffix(self) -> None:
        """3'te, 1990'larda should be NUM + SUFFIX."""
        tokens = self.tok.tokenize("3'te geldim")
        num = [t for t in tokens if t["token_type"] == "NUM"]
        self.assertTrue(len(num) >= 1)

    def test_integration_full_sentence(self) -> None:
        """Full integration test with mixed content."""
        tokens = self.tok.tokenize("İstanbul'da meeting'e katılamadım")
        self.assertTrue(len(tokens) > 0)
        # Verify the critical acceptance criteria on a single pass.
        self.assertIsInstance(tokens, list)
        self.assertTrue(all("token" in tok and "token_type" in tok for tok in tokens))


if __name__ == "__main__":
    unittest.main()