File size: 936 Bytes
855cbe6 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 |
import unittest
from tokenizer import BPETokenizer
class TokenizerTest(unittest.TestCase):
def setUp(self):
self.vocab = {
"<PAD>":0,
"<UNK>":1,
"a":2,
"b":3,
"c":4,
"ab":5,
"bc":6,
"abc":7,
"</w>":8
}
self.merges = [["a","b"],["b","c"],["ab","c"]]
self.tokenizer = BPETokenizer(vocab=self.vocab, merges=self.merges)
def test_tokenize_and_encode(self):
tokens = self.tokenizer.tokenize("abc")
self.assertTrue(isinstance(tokens, list))
indices = self.tokenizer.encode("abc")
self.assertTrue(all(isinstance(i, int) for i in indices))
def test_decode(self):
encoded = self.tokenizer.encode("abc")
decoded = self.tokenizer.decode(encoded)
self.assertTrue(isinstance(decoded, str))
# Run the test suite only when this file is executed directly as a script,
# not when it is imported by another module or a test collector.
if __name__ == "__main__":
    unittest.main()
|