import sys
import unittest
from pathlib import Path

from crayon.core.vocabulary import CrayonVocab
from crayon.core.primitives import TokenMetadata


class TestCoreTokenization(unittest.TestCase):
    """Tests for ``CrayonVocab`` tokenization and ``TokenMetadata`` attributes.

    One vocabulary, built from a fixed five-token list, is shared by every
    test in the class.
    """

    @classmethod
    def setUpClass(cls):
        """Build the shared token list and vocabulary once for the class."""
        # The ids expected by test_longest_match_priority ([3, 2]) are the
        # list positions of "unfortunate" and "ly" below, so the order of
        # this list matters (presumably ids are assigned by position --
        # confirm against CrayonVocab).
        cls.tokens = ["un", "fortunate", "ly", "unfortunate", "man"]
        cls.vocab = CrayonVocab(cls.tokens)

    def test_longest_match_priority(self):
        """
        Verify that the tokenizer strictly prefers the longest match.

        'unfortunately' -> 'unfortunate' + 'ly' (since 'unfortunately' itself
        is not in the vocabulary), rather than 'un' + 'fortunate' + 'ly'.
        """
        text = "unfortunately"
        ids = self.vocab.tokenize(text)

        # 3 == position of "unfortunate", 2 == position of "ly".
        self.assertEqual(ids, [3, 2])

        # Decoding the ids must reproduce the input text.
        decoded = self.vocab.decode(ids)
        self.assertEqual(decoded, "unfortunately")

    def test_unknown_token_fallback(self):
        """Verify fallback for unknown bytes/tokens."""
        text = "x"
        ids = self.vocab.tokenize(text)

        # NOTE(review): 1 is presumably the unknown-token id, but it is also
        # the list position of "fortunate" in cls.tokens -- confirm against
        # CrayonVocab which meaning this assertion actually exercises.
        self.assertIn(1, ids)

    def test_metadata_memory_layout(self):
        """Verify that ``TokenMetadata`` rejects undeclared attributes."""
        meta = TokenMetadata(token_id=1, frequency=100, average_length=5.5)

        # Assigning an attribute that was not declared must fail; either
        # exception type is accepted.
        with self.assertRaises((AttributeError, TypeError)):
            meta.new_attr = 1

    def test_vocabulary_contains(self):
        """Test vocabulary membership checks."""
        self.assertIn("unfortunate", self.vocab)
        self.assertNotIn("nonexistent", self.vocab)

    def test_vocabulary_size(self):
        """Test that the vocabulary reports one entry per supplied token."""
        self.assertEqual(len(self.vocab), 5)


# Run the test suite when this file is executed directly as a script.
if __name__ == "__main__":
    unittest.main()