# File size: 2,038 Bytes
# 708f4a3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60

import unittest
import sys
from pathlib import Path


from crayon.core.vocabulary import CrayonVocab
from crayon.core.primitives import TokenMetadata

class TestCoreTokenization(unittest.TestCase):
    """Exercise CrayonVocab tokenization, decoding, membership and sizing."""

    @classmethod
    def setUpClass(cls):
        """Build one vocabulary that is shared by every test in this class."""
        # The ID assertions further down rely on IDs following list order
        # (un:0, fortunate:1, ly:2, unfortunate:3, man:4). That ordering is a
        # property of CrayonVocab as described by the original author; it is
        # not established anywhere in this file.
        cls.tokens = [
            "un",
            "fortunate",
            "ly",
            "unfortunate",
            "man",
        ]
        cls.vocab = CrayonVocab(cls.tokens)

    def test_longest_match_priority(self):
        """
        Check that the longest vocabulary entry wins at each position.

        'unfortunately' itself is not in the vocab, so it is expected to
        split into 'unfortunate' + 'ly' rather than 'un' + 'fortunate' + 'ly'.
        """
        word = "unfortunately"
        token_ids = self.vocab.tokenize(word)

        # Expected IDs: 'unfortunate' -> 3 followed by 'ly' -> 2.
        expected_ids = [3, 2]
        self.assertEqual(token_ids, expected_ids)

        # Decoding those IDs must reproduce the input text.
        self.assertEqual(self.vocab.decode(token_ids), "unfortunately")

    def test_unknown_token_fallback(self):
        """Check that text outside the vocabulary yields the fallback ID."""
        unknown_text = "x"  # no vocabulary entry matches 'x'
        token_ids = self.vocab.tokenize(unknown_text)

        # The engine's hardcoded fallback ID is stated to be 1.
        # NOTE(review): 1 is also the ID expected for 'fortunate' in
        # setUpClass -- confirm the fallback cannot be mistaken for a real
        # token.
        fallback_id = 1
        self.assertIn(fallback_id, token_ids)

    def test_metadata_memory_layout(self):
        """Check that TokenMetadata rejects attributes it does not declare."""
        record = TokenMetadata(token_id=1, frequency=100, average_length=5.5)

        # Presumably TokenMetadata is a frozen, slotted dataclass (TODO
        # confirm against crayon.core.primitives).
        # dataclasses.FrozenInstanceError is a subclass of AttributeError, so
        # the tuple below covers it; TypeError is accepted as well.
        rejected_with = (AttributeError, TypeError)
        with self.assertRaises(rejected_with):
            record.new_attr = 1  # must be refused

    def test_vocabulary_contains(self):
        """Check `in` membership against the vocabulary."""
        present, absent = "unfortunate", "nonexistent"
        self.assertIn(present, self.vocab)
        self.assertNotIn(absent, self.vocab)

    def test_vocabulary_size(self):
        """Check that len() reports one entry per token supplied."""
        expected_size = 5
        self.assertEqual(len(self.vocab), expected_size)

# Allow running this file directly as a script: unittest.main() loads the
# TestCase classes defined in this module and runs them.
if __name__ == "__main__":
    unittest.main()