File size: 936 Bytes
855cbe6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
import unittest
from tokenizer import BPETokenizer

class TokenizerTest(unittest.TestCase):
    """Smoke tests for ``BPETokenizer`` built from a small hand-written vocab.

    These tests only check the *types* returned by ``tokenize``, ``encode``
    and ``decode``; they do not pin specific tokens or ids.
    """

    def setUp(self):
        """Create a fresh tokenizer from a nine-entry vocab and three merges."""
        self.vocab = {
            "<PAD>": 0,
            "<UNK>": 1,
            "a": 2,
            "b": 3,
            "c": 4,
            "ab": 5,
            "bc": 6,
            "abc": 7,
            "</w>": 8,
        }
        # Symbol pairs handed to the tokenizer as its merge list.
        self.merges = [["a", "b"], ["b", "c"], ["ab", "c"]]
        self.tokenizer = BPETokenizer(vocab=self.vocab, merges=self.merges)

    def test_tokenize_and_encode(self):
        """``tokenize`` returns a list and ``encode`` yields only ints."""
        tokens = self.tokenizer.tokenize("abc")
        # assertIsInstance reports the offending value and expected type on
        # failure, unlike assertTrue(isinstance(...)).
        self.assertIsInstance(tokens, list)
        indices = self.tokenizer.encode("abc")
        for index in indices:
            self.assertIsInstance(index, int)

    def test_decode(self):
        """``decode`` of an encoded string returns a ``str``."""
        encoded = self.tokenizer.encode("abc")
        decoded = self.tokenizer.decode(encoded)
        self.assertIsInstance(decoded, str)

# Run this module's test cases when the file is executed directly as a script.
if __name__ == "__main__":
    unittest.main()