File size: 10,365 Bytes
ca97aa9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
import { AlbertTokenizer } from "../../../src/tokenizers.js";
import { BASE_TEST_STRINGS, BERT_TEST_STRINGS } from "../test_strings.js";

export const TOKENIZER_CLASS = AlbertTokenizer;
export const TEST_CONFIG = {
  // - uses `StripAccents` normalizer
  "Xenova/albert-base-v2": {
    SIMPLE: {
      text: BASE_TEST_STRINGS.SIMPLE,
      tokens: ["\u2581how", "\u2581are", "\u2581you", "\u2581doing", "?"],
      ids: [2, 184, 50, 42, 845, 60, 3],
      decoded: "[CLS] how are you doing?[SEP]",
    },
    SIMPLE_WITH_PUNCTUATION: {
      text: BASE_TEST_STRINGS.SIMPLE_WITH_PUNCTUATION,
      tokens: ["\u2581you", "\u2581should", "'", "ve", "\u2581done", "\u2581this"],
      ids: [2, 42, 378, 22, 195, 677, 48, 3],
      decoded: "[CLS] you should've done this[SEP]",
    },
    NUMBERS: {
      text: BASE_TEST_STRINGS.NUMBERS,
      tokens: ["\u25810", "12", "345", "67", "89", "\u25810", "\u25811", "\u25812", "\u25813", "\u25814", "\u25815", "\u25816", "\u25817", "\u25818", "\u25819", "\u258110", "\u2581100", "\u25811000"],
      ids: [2, 713, 918, 21997, 4167, 3877, 713, 137, 172, 203, 268, 331, 400, 453, 469, 561, 332, 808, 6150, 3],
      decoded: "[CLS] 0123456789 0 1 2 3 4 5 6 7 8 9 10 100 1000[SEP]",
    },
    TEXT_WITH_NUMBERS: {
      text: BASE_TEST_STRINGS.TEXT_WITH_NUMBERS,
      tokens: ["\u2581the", "\u2581company", "\u2581was", "\u2581founded", "\u2581in", "\u25812016", "."],
      ids: [2, 14, 237, 23, 785, 19, 690, 9, 3],
      decoded: "[CLS] the company was founded in 2016.[SEP]",
    },
    PUNCTUATION: {
      text: BASE_TEST_STRINGS.PUNCTUATION,
      tokens: ["\u2581a", "\u2581", "'", "ll", "\u2581", "!!", "to", "?'", "d", '"', "d", "\u2581of", ",", "\u2581can", "'", "t", "."],
      ids: [2, 21, 13, 22, 211, 13, 19015, 262, 5663, 43, 7, 43, 16, 15, 92, 22, 38, 9, 3],
      decoded: "[CLS] a 'll!!to?'d\"d of, can't.[SEP]",
    },
    PYTHON_CODE: {
      text: BASE_TEST_STRINGS.PYTHON_CODE,
      tokens: ["\u2581def", "\u2581main", "(", ")", ":", "\u2581pass"],
      ids: [2, 6312, 407, 5, 6, 45, 1477, 3],
      decoded: "[CLS] def main(): pass[SEP]",
    },
    JAVASCRIPT_CODE: {
      text: BASE_TEST_STRINGS.JAVASCRIPT_CODE,
      tokens: ["\u2581let", "\u2581a", "\u2581=", "\u2581ob", "j", ".", "to", "string", "(", ")", ";", "\u2581to", "string", "(", ")", ";"],
      ids: [2, 408, 21, 800, 5122, 728, 9, 262, 11130, 5, 6, 73, 20, 11130, 5, 6, 73, 3],
      decoded: "[CLS] let a = obj.tostring(); tostring();[SEP]",
    },
    NEWLINES: {
      text: BASE_TEST_STRINGS.NEWLINES,
      tokens: ["\u2581this", "\u2581is", "\u2581a", "\u2581test", "."],
      ids: [2, 48, 25, 21, 1289, 9, 3],
      decoded: "[CLS] this is a test.[SEP]",
    },
    BASIC: {
      text: BASE_TEST_STRINGS.BASIC,
      tokens: ["\u2581unwanted", ",", "running"],
      ids: [2, 21095, 15, 11325, 3],
      decoded: "[CLS] unwanted,running[SEP]",
    },
    CONTROL_TOKENS: {
      text: BASE_TEST_STRINGS.CONTROL_TOKENS,
      tokens: ["\u25811", "\u0000", "2", "\u25813"],
      ids: [2, 137, 1, 135, 203, 3],
      decoded: "[CLS] 1<unk>2 3[SEP]",
    },
    HELLO_WORLD_TITLECASE: {
      text: BASE_TEST_STRINGS.HELLO_WORLD_TITLECASE,
      tokens: ["\u2581hello", "\u2581world"],
      ids: [2, 10975, 126, 3],
      decoded: "[CLS] hello world[SEP]",
    },
    HELLO_WORLD_LOWERCASE: {
      text: BASE_TEST_STRINGS.HELLO_WORLD_LOWERCASE,
      tokens: ["\u2581hello", "\u2581world"],
      ids: [2, 10975, 126, 3],
      decoded: "[CLS] hello world[SEP]",
    },
    CHINESE_ONLY: {
      text: BASE_TEST_STRINGS.CHINESE_ONLY,
      tokens: ["\u2581", "\u751f\u6d3b\u7684\u771f\u8c1b\u662f"],
      ids: [2, 13, 1, 3],
      decoded: "[CLS] <unk>[SEP]",
    },
    LEADING_SPACE: {
      text: BASE_TEST_STRINGS.LEADING_SPACE,
      tokens: ["\u2581leading", "\u2581space"],
      ids: [2, 1005, 726, 3],
      decoded: "[CLS] leading space[SEP]",
    },
    TRAILING_SPACE: {
      text: BASE_TEST_STRINGS.TRAILING_SPACE,
      tokens: ["\u2581trailing", "\u2581space"],
      ids: [2, 14323, 726, 3],
      decoded: "[CLS] trailing space[SEP]",
    },
    DOUBLE_SPACE: {
      text: BASE_TEST_STRINGS.DOUBLE_SPACE,
      tokens: ["\u2581hi", "\u2581hello"],
      ids: [2, 4148, 10975, 3],
      decoded: "[CLS] hi hello[SEP]",
    },
    CURRENCY: {
      text: BASE_TEST_STRINGS.CURRENCY,
      tokens: ["\u2581test", "\u2581$1", "\u2581r", "2", "\u2581#3", "\u2581", "\u20ac", "4", "\u2581", "\u00a3", "5", "\u2581", "\u00a5", "6", "\u2581", "\u20a3", "7", "\u2581", "\u20b9", "8", "\u2581", "\u20b1", "9", "\u2581test"],
      ids: [2, 1289, 3742, 761, 135, 11489, 13, 12, 300, 13, 11, 264, 13, 1, 379, 13, 1, 465, 13, 1, 457, 13, 1, 518, 1289, 3],
      decoded: "[CLS] test $1 r2 #3 \u20ac4 \u00a35 <unk>6 <unk>7 <unk>8 <unk>9 test[SEP]",
    },
    CURRENCY_WITH_DECIMALS: {
      text: BASE_TEST_STRINGS.CURRENCY_WITH_DECIMALS,
      tokens: ["\u2581i", "\u2581bought", "\u2581an", "\u2581apple", "\u2581for", "\u2581$1", ".", "00", "\u2581at", "\u2581the", "\u2581store", "."],
      ids: [2, 31, 2448, 40, 4037, 26, 3742, 9, 2032, 35, 14, 1718, 9, 3],
      decoded: "[CLS] i bought an apple for $1.00 at the store.[SEP]",
    },
    ELLIPSIS: {
      text: BASE_TEST_STRINGS.ELLIPSIS,
      tokens: ["\u2581you", ".", ".", "."],
      ids: [2, 42, 9, 9, 9, 3],
      decoded: "[CLS] you...[SEP]",
    },
    TEXT_WITH_ESCAPE_CHARACTERS: {
      text: BASE_TEST_STRINGS.TEXT_WITH_ESCAPE_CHARACTERS,
      tokens: ["\u2581you", ".", ".", "."],
      ids: [2, 42, 9, 9, 9, 3],
      decoded: "[CLS] you...[SEP]",
    },
    TEXT_WITH_ESCAPE_CHARACTERS_2: {
      text: BASE_TEST_STRINGS.TEXT_WITH_ESCAPE_CHARACTERS_2,
      tokens: ["\u2581you", ".", ".", ".", "\u2581you", ".", ".", "."],
      ids: [2, 42, 9, 9, 9, 42, 9, 9, 9, 3],
      decoded: "[CLS] you... you...[SEP]",
    },
    TILDE_NORMALIZATION: {
      text: BASE_TEST_STRINGS.TILDE_NORMALIZATION,
      tokens: ["\u2581weird", "\u2581", "~", "\u2581edge", "\u2581", "~", "\u2581case"],
      ids: [2, 5455, 13, 1, 1407, 13, 1, 610, 3],
      decoded: "[CLS] weird <unk> edge <unk> case[SEP]",
    },
    SPIECE_UNDERSCORE: {
      text: BASE_TEST_STRINGS.SPIECE_UNDERSCORE,
      tokens: ["\u2581this", "\u2581is", "\u2581a", "\u2581test", "\u2581", "."],
      ids: [2, 48, 25, 21, 1289, 13, 9, 3],
      decoded: "[CLS] this is a test.[SEP]",
    },
    POPULAR_EMOJIS: {
      text: BASE_TEST_STRINGS.POPULAR_EMOJIS,
      tokens: ["\u2581", "\ud83d\ude02", "\u2581", "\ud83d\udc4d", "\u2581", "\ud83e\udd23", "\u2581", "\ud83d\ude0d", "\u2581", "\ud83d\ude2d", "\u2581", "\ud83c\udf89", "\u2581", "\ud83d\ude4f", "\u2581", "\ud83d\ude0a", "\u2581", "\ud83d\udd25", "\u2581", "\ud83d\ude01", "\u2581", "\ud83d\ude05", "\u2581", "\ud83e\udd17", "\u2581", "\ud83d\ude06", "\u2581", "\ud83d\udc4f", "\u2581", "\u2764", "\u2581", "\ud83d\udc9c", "\u2581", "\ud83d\udc9a", "\u2581", "\ud83d\udc97", "\u2581", "\ud83d\udc99", "\u2581", "\ud83d\udda4", "\u2581", "\ud83d\ude0e", "\u2581", "\ud83d\udc4c", "\u2581", "\ud83e\udd73", "\u2581", "\ud83d\udcaa", "\u2581", "\u2728", "\u2581", "\ud83d\udc49", "\u2581", "\ud83d\udc40", "\u2581", "\ud83d\udcaf", "\u2581", "\ud83c\udf88", "\u2581", "\ud83d\ude48", "\u2581", "\ud83d\ude4c", "\u2581", "\ud83d\udc80", "\u2581", "\ud83d\udc47", "\u2581", "\ud83d\udc4b", "\u2581", "\u2705", "\u2581", "\ud83c\udf81", "\u2581", "\ud83c\udf1e", "\u2581", "\ud83c\udf38", "\u2581", "\ud83d\udcb0"],
      ids: [2, 13, 1, 13, 1, 13, 1, 13, 1, 13, 1, 13, 1, 13, 1, 13, 1, 13, 1, 13, 1, 13, 1, 13, 1, 13, 1, 13, 1, 13, 1, 13, 1, 13, 1, 13, 1, 13, 1, 13, 1, 13, 1, 13, 1, 13, 1, 13, 1, 13, 1, 13, 1, 13, 1, 13, 1, 13, 1, 13, 1, 13, 1, 13, 1, 13, 1, 13, 1, 13, 1, 13, 1, 13, 1, 13, 1, 13, 1, 3],
      decoded: "[CLS] <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk>[SEP]",
    },
    MULTIBYTE_EMOJIS: {
      text: BASE_TEST_STRINGS.MULTIBYTE_EMOJIS,
      tokens: ["\u2581", "\u2728", "\u2581", "\ud83e\udd17", "\u2581", "\ud83d\udc41", "\u2581", "\ud83d\udc71\ud83c\udffb", "\u2581", "\ud83d\udd75", "\u2581", "\u2642", "\u2581", "\ud83e\uddd9\ud83c\udffb", "\u2581", "\u2642", "\u2581", "\ud83d\udc68\ud83c\udffb", "\u2581", "\ud83c\udf3e", "\u2581", "\ud83e\uddd1", "\u2581", "\ud83e\udd1d", "\u2581", "\ud83e\uddd1", "\u2581", "\ud83d\udc69", "\u2581", "\u2764", "\u2581", "\ud83d\udc8b", "\u2581", "\ud83d\udc68", "\u2581", "\ud83d\udc69", "\u2581", "\ud83d\udc69", "\u2581", "\ud83d\udc67", "\u2581", "\ud83d\udc66", "\u2581", "\ud83e\uddd1\ud83c\udffb", "\u2581", "\ud83e\udd1d", "\u2581", "\ud83e\uddd1\ud83c\udffb", "\u2581", "\ud83c\udff4\udb40\udc67\udb40\udc62\udb40\udc65\udb40\udc6e\udb40\udc67\udb40\udc7f", "\u2581", "\ud83d\udc68\ud83c\udffb", "\u2581", "\u2764", "\u2581", "\ud83d\udc8b", "\u2581", "\ud83d\udc68\ud83c\udffc"],
      ids: [2, 13, 1, 13, 1, 13, 1, 13, 1, 13, 1, 13, 1, 13, 1, 13, 1, 13, 1, 13, 1, 13, 1, 13, 1, 13, 1, 13, 1, 13, 1, 13, 1, 13, 1, 13, 1, 13, 1, 13, 1, 13, 1, 13, 1, 13, 1, 13, 1, 13, 1, 13, 1, 13, 1, 13, 1, 13, 1, 3],
      decoded: "[CLS] <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk>[SEP]",
    },
    CHINESE_LATIN_MIXED: {
      text: BERT_TEST_STRINGS.CHINESE_LATIN_MIXED,
      tokens: ["\u2581", "ah", "\u535a\u63a8", "zz"],
      ids: [2, 13, 1307, 1, 5092, 3],
      decoded: "[CLS] ah<unk>zz[SEP]",
    },
    SIMPLE_WITH_ACCENTS: {
      text: BERT_TEST_STRINGS.SIMPLE_WITH_ACCENTS,
      tokens: ["\u2581hello"],
      ids: [2, 10975, 3],
      decoded: "[CLS] hello[SEP]",
    },
    MIXED_CASE_WITHOUT_ACCENTS: {
      text: BERT_TEST_STRINGS.MIXED_CASE_WITHOUT_ACCENTS,
      tokens: ["\u2581hello", "!", "how", "\u2581are", "\u2581you", "?"],
      ids: [2, 10975, 187, 1544, 50, 42, 60, 3],
      decoded: "[CLS] hello!how are you?[SEP]",
    },
    MIXED_CASE_WITH_ACCENTS: {
      text: BERT_TEST_STRINGS.MIXED_CASE_WITH_ACCENTS,
      tokens: ["\u2581hall", "o", "!", "how", "\u2581are", "\u2581you", "?"],
      ids: [2, 554, 111, 187, 1544, 50, 42, 60, 3],
      decoded: "[CLS] hallo!how are you?[SEP]",
    },
  },
};