# File size: 2,038 Bytes
# 708f4a3
import unittest
import sys
from pathlib import Path
from crayon.core.vocabulary import CrayonVocab
from crayon.core.primitives import TokenMetadata
class TestCoreTokenization(unittest.TestCase):
    """Exercise CrayonVocab tokenization and TokenMetadata immutability.

    Every test shares the single vocabulary built once in ``setUpClass``.
    """

    @classmethod
    def setUpClass(cls):
        """Build the shared five-token vocabulary used by every test."""
        cls.tokens = ["un", "fortunate", "ly", "unfortunate", "man"]
        # The ID assertions below rely on IDs following list order:
        # un:0, fortunate:1, ly:2, unfortunate:3, man:4
        cls.vocab = CrayonVocab(cls.tokens)

    def test_longest_match_priority(self):
        """Verify that the tokenizer strictly prefers the longest match.

        'unfortunately' is not in the vocabulary, so it is expected to split
        into 'unfortunate' + 'ly' rather than 'un' + 'fortunate' + 'ly'.
        """
        text = "unfortunately"
        ids = self.vocab.tokenize(text)

        # 'unfortunate' (3) followed by 'ly' (2).
        self.assertEqual(ids, [3, 2])

        # Decoding the IDs must reproduce the input text.
        decoded = self.vocab.decode(ids)
        self.assertEqual(decoded, "unfortunately")

    def test_unknown_token_fallback(self):
        """Verify fallback for unknown bytes/tokens."""
        text = "x"  # 'x' is not in the vocabulary
        ids = self.vocab.tokenize(text)

        # Per the original author's note, the engine's hardcoded fallback
        # ID is 1.
        # NOTE(review): ID 1 is also "fortunate" in this fixture, so this
        # check cannot tell the fallback apart from a real match -- confirm
        # the intended unknown-token ID.
        self.assertIn(1, ids)

    def test_metadata_memory_layout(self):
        """Verify that TokenMetadata rejects assignment of a new attribute."""
        meta = TokenMetadata(token_id=1, frequency=100, average_length=5.5)

        # dataclasses.FrozenInstanceError is a subclass of AttributeError,
        # so AttributeError covers the frozen-dataclass case; TypeError is
        # accepted as well.
        # NOTE(review): presumably TokenMetadata is a frozen dataclass with
        # __slots__ -- its definition is not visible here; confirm.
        with self.assertRaises((AttributeError, TypeError)):
            meta.new_attr = 1

    def test_vocabulary_contains(self):
        """Test vocabulary membership checks via the ``in`` operator."""
        self.assertIn("unfortunate", self.vocab)
        self.assertNotIn("nonexistent", self.vocab)

    def test_vocabulary_size(self):
        """Test that ``len(vocab)`` equals the number of fixture tokens."""
        self.assertEqual(len(self.vocab), 5)
# Run the test suite when this file is executed directly as a script.
if __name__ == "__main__":
    unittest.main()