# OmniCoreX / tests/test_tokenizer.py
# Last commit 9bbfc92 (verified): "Rename test/test_tokenizer.py to tests/test_tokenizer.py"
import unittest
from tokenizer import BPETokenizer
class TokenizerTest(unittest.TestCase):
    """Smoke tests for ``BPETokenizer`` built from a tiny hand-written vocabulary.

    These tests only check the *types* returned by ``tokenize``, ``encode``
    and ``decode``; they do not pin specific tokens, ids or decoded text.
    """

    def setUp(self):
        """Create a fresh vocabulary, merge list and tokenizer for each test."""
        # Token -> id table: two special tokens, the single characters,
        # their merged forms, and an end-of-word marker.
        self.vocab = {
            "<PAD>": 0,
            "<UNK>": 1,
            "a": 2,
            "b": 3,
            "c": 4,
            "ab": 5,
            "bc": 6,
            "abc": 7,
            "</w>": 8,
        }
        # Each pair concatenates to a token present in ``self.vocab``
        # ("ab", "bc", "abc").
        # NOTE(review): presumably list order is the merge priority -- confirm
        # against BPETokenizer.
        self.merges = [["a", "b"], ["b", "c"], ["ab", "c"]]
        self.tokenizer = BPETokenizer(vocab=self.vocab, merges=self.merges)

    def test_tokenize_and_encode(self):
        """``tokenize`` returns a list and ``encode`` yields only ints."""
        tokens = self.tokenizer.tokenize("abc")
        self.assertIsInstance(tokens, list)

        indices = self.tokenizer.encode("abc")
        # Checked item by item so a failure names the offending position.
        # An empty result still passes, exactly as the ``all(...)`` form did.
        for position, index in enumerate(indices):
            self.assertIsInstance(
                index,
                int,
                "encode() item {} is not an int".format(position),
            )

    def test_decode(self):
        """Decoding the ids produced by ``encode`` returns a string."""
        encoded = self.tokenizer.encode("abc")
        decoded = self.tokenizer.decode(encoded)
        self.assertIsInstance(decoded, str)
if __name__ == "__main__":
    # Run this module's test cases when the file is executed as a script.
    unittest.main()