File size: 3,093 Bytes
3cf4fff
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
# Copyright (c) Meta Platforms, Inc. and affiliates.
# This software may be used and distributed in accordance with the terms of the Llama 3 Community License Agreement.

import os
from unittest import TestCase

from tokenizer import ChatFormat, Llama3Tokenizer

# From current directory:
# TOKENIZER_PATH=<path> python -m unittest llama3_tokenizer_test.py


class TokenizerTests(TestCase):
    """Regression tests for ``Llama3Tokenizer`` and ``ChatFormat``.

    The expected token ids are pinned against the tokenizer loaded from the
    path in the ``TOKENIZER_PATH`` environment variable.
    """

    def setUp(self):
        """Build a fresh tokenizer and chat formatter for each test.

        When ``TOKENIZER_PATH`` is unset or empty the test is skipped with an
        explanatory message rather than erroring with a bare ``KeyError``.
        """
        tokenizer_path = os.environ.get("TOKENIZER_PATH")
        if not tokenizer_path:
            # skipTest raises, so nothing below runs without a usable path.
            self.skipTest(
                "TOKENIZER_PATH is not set; run as "
                "TOKENIZER_PATH=<path> python -m unittest <this test file>"
            )
        self.tokenizer = Llama3Tokenizer(tokenizer_path)
        self.format = ChatFormat(self.tokenizer)

    def test_special_tokens(self):
        """``<|begin_of_text|>`` is registered under id 128000."""
        begin_of_text_id = self.tokenizer.special_tokens["<|begin_of_text|>"]
        self.assertEqual(begin_of_text_id, 128000)

    def test_encode(self):
        """Encoding with ``add_bos`` and ``add_eos`` wraps the text ids."""
        expected_ids = [128000, 2028, 374, 264, 1296, 11914, 13, 128001]
        actual_ids = self.tokenizer.encode(
            "This is a test sentence.", add_bos=True, add_eos=True
        )
        self.assertEqual(actual_ids, expected_ids)

    def test_decode(self):
        """Decoding renders the leading and trailing special ids as text."""
        token_ids = [128000, 2028, 374, 264, 1296, 11914, 13, 128001]
        self.assertEqual(
            self.tokenizer.decode(token_ids),
            "<|begin_of_text|>This is a test sentence.<|end_of_text|>",
        )

    def test_encode_message(self):
        """A single message is encoded as header, content, then end marker."""
        message = {
            "role": "user",
            "content": "This is a test sentence.",
        }
        expected_ids = [
            128006,  # <|start_header_id|>
            882,  # "user"
            128007,  # <|end_header_id|>
            271,  # "\n\n"
            2028,
            374,
            264,
            1296,
            11914,
            13,  # "This is a test sentence."
            128009,  # <|eot_id|>
        ]
        self.assertEqual(self.format.encode_message(message), expected_ids)

    def test_encode_dialog(self):
        """A dialog prompt is each message in order plus an assistant header."""
        dialog = [
            {
                "role": "system",
                "content": "This is a test sentence.",
            },
            {
                "role": "user",
                "content": "This is a response.",
            },
        ]
        expected_ids = [
            128000,  # <|begin_of_text|>
            128006,  # <|start_header_id|>
            9125,  # "system"
            128007,  # <|end_header_id|>
            271,  # "\n\n"
            2028,
            374,
            264,
            1296,
            11914,
            13,  # "This is a test sentence."
            128009,  # <|eot_id|>
            128006,  # <|start_header_id|>
            882,  # "user"
            128007,  # <|end_header_id|>
            271,  # "\n\n"
            2028,
            374,
            264,
            2077,
            13,  # "This is a response."
            128009,  # <|eot_id|>
            # The prompt ends with an open assistant header and no content.
            128006,  # <|start_header_id|>
            78191,  # "assistant"
            128007,  # <|end_header_id|>
            271,  # "\n\n"
        ]
        self.assertEqual(
            self.format.encode_dialog_prompt(dialog),
            expected_ids,
        )