tokenizer-parity-v1 / bert /decoded.json
dollspace's picture
feat: pin ferrotorch-tokenize parity fixtures v1 (#1168)
f41659a verified
{
"decode_with_special_keep": [
"[CLS] hello, world! [SEP]",
"[CLS] the quick brown fox jumps over the lazy dog. the quick brown fox jumps over the lazy dog. the quick brown fox jumps over the lazy dog. [SEP]",
"[CLS] 日 本 語 のテスト [UNK] emoji [SEP]",
"[CLS] indented text [SEP]",
"[CLS] def foo ( x ) : return x + 1 [SEP]",
"[CLS] < | begin _ of _ text | > hello < | end _ of _ text | > [SEP]",
"[CLS] [CLS] sentence a [SEP] sentence b [SEP] [SEP]",
"[CLS] [SEP]",
"[CLS] a [SEP]",
"[CLS] leading and trailing [SEP]",
"[CLS] mixed 123 with numbers 4567 and symbols! @ # $ % ^ & * ( ) [SEP]",
"[CLS] newline three [SEP]",
"[CLS] tab tab tab [SEP]",
"[CLS] quote \" double \" and ' single ' and ` backtick ` [SEP]",
"[CLS] url : https : / / example. com / path? query = value & other = 1 [SEP]",
"[CLS] email : alice @ example. com, bob @ foo. io [SEP]",
"[CLS] 中 文 [UNK] [UNK] with english mixed [SEP]",
"[CLS] repeating aaaaaaaaaaaa and bbbbbbbbbbbb [SEP]",
"[CLS] emoji rain [UNK] and stars [UNK] [SEP]",
"[CLS] code : ` int main ( ) { return 0 ; } ` [SEP]"
],
"decode_with_special_skip": [
"hello, world!",
"the quick brown fox jumps over the lazy dog. the quick brown fox jumps over the lazy dog. the quick brown fox jumps over the lazy dog.",
"日 本 語 のテスト emoji",
"indented text",
"def foo ( x ) : return x + 1",
"< | begin _ of _ text | > hello < | end _ of _ text | >",
"sentence a sentence b",
"",
"a",
"leading and trailing",
"mixed 123 with numbers 4567 and symbols! @ # $ % ^ & * ( )",
"newline three",
"tab tab tab",
"quote \" double \" and ' single ' and ` backtick `",
"url : https : / / example. com / path? query = value & other = 1",
"email : alice @ example. com, bob @ foo. io",
"中 文 with english mixed",
"repeating aaaaaaaaaaaa and bbbbbbbbbbbb",
"emoji rain and stars",
"code : ` int main ( ) { return 0 ; } `"
],
"decode_no_special": [
"hello, world!",
"the quick brown fox jumps over the lazy dog. the quick brown fox jumps over the lazy dog. the quick brown fox jumps over the lazy dog.",
"日 本 語 のテスト [UNK] emoji",
"indented text",
"def foo ( x ) : return x + 1",
"< | begin _ of _ text | > hello < | end _ of _ text | >",
"[CLS] sentence a [SEP] sentence b [SEP]",
"",
"a",
"leading and trailing",
"mixed 123 with numbers 4567 and symbols! @ # $ % ^ & * ( )",
"newline three",
"tab tab tab",
"quote \" double \" and ' single ' and ` backtick `",
"url : https : / / example. com / path? query = value & other = 1",
"email : alice @ example. com, bob @ foo. io",
"中 文 [UNK] [UNK] with english mixed",
"repeating aaaaaaaaaaaa and bbbbbbbbbbbb",
"emoji rain [UNK] and stars [UNK]",
"code : ` int main ( ) { return 0 ; } `"
]
}