char128_shift_tokenizer / tokenizer.json
Corianas's picture
Update tokenizer.json
c1010b0 verified
{
"version": "1.0",
"truncation": null,
"padding": null,
"added_tokens": [
{
"id": 0,
"content": "<pad>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 1,
"content": "<unk>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 2,
"content": "<bos>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 3,
"content": "<eos>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
}
],
"normalizer": {
"type": "Sequence",
"normalizers": [
{
"type": "NFKC"
},
{
"type": "Replace",
"pattern": {
"Regex": "A"
},
"content": "↨a"
},
{
"type": "Replace",
"pattern": {
"Regex": "B"
},
"content": "↨b"
},
{
"type": "Replace",
"pattern": {
"Regex": "C"
},
"content": "↨c"
},
{
"type": "Replace",
"pattern": {
"Regex": "D"
},
"content": "↨d"
},
{
"type": "Replace",
"pattern": {
"Regex": "E"
},
"content": "↨e"
},
{
"type": "Replace",
"pattern": {
"Regex": "F"
},
"content": "↨f"
},
{
"type": "Replace",
"pattern": {
"Regex": "G"
},
"content": "↨g"
},
{
"type": "Replace",
"pattern": {
"Regex": "H"
},
"content": "↨h"
},
{
"type": "Replace",
"pattern": {
"Regex": "I"
},
"content": "↨i"
},
{
"type": "Replace",
"pattern": {
"Regex": "J"
},
"content": "↨j"
},
{
"type": "Replace",
"pattern": {
"Regex": "K"
},
"content": "↨k"
},
{
"type": "Replace",
"pattern": {
"Regex": "L"
},
"content": "↨l"
},
{
"type": "Replace",
"pattern": {
"Regex": "M"
},
"content": "↨m"
},
{
"type": "Replace",
"pattern": {
"Regex": "N"
},
"content": "↨n"
},
{
"type": "Replace",
"pattern": {
"Regex": "O"
},
"content": "↨o"
},
{
"type": "Replace",
"pattern": {
"Regex": "P"
},
"content": "↨p"
},
{
"type": "Replace",
"pattern": {
"Regex": "Q"
},
"content": "↨q"
},
{
"type": "Replace",
"pattern": {
"Regex": "R"
},
"content": "↨r"
},
{
"type": "Replace",
"pattern": {
"Regex": "S"
},
"content": "↨s"
},
{
"type": "Replace",
"pattern": {
"Regex": "T"
},
"content": "↨t"
},
{
"type": "Replace",
"pattern": {
"Regex": "U"
},
"content": "↨u"
},
{
"type": "Replace",
"pattern": {
"Regex": "V"
},
"content": "↨v"
},
{
"type": "Replace",
"pattern": {
"Regex": "W"
},
"content": "↨w"
},
{
"type": "Replace",
"pattern": {
"Regex": "X"
},
"content": "↨x"
},
{
"type": "Replace",
"pattern": {
"Regex": "Y"
},
"content": "↨y"
},
{
"type": "Replace",
"pattern": {
"Regex": "Z"
},
"content": "↨z"
}
]
},
"pre_tokenizer": {
"type": "Split",
"pattern": {
"Regex": "\\X"
},
"behavior": "Isolated",
"invert": false
},
"post_processor": {
"type": "TemplateProcessing",
"single": [
{
"Sequence": {
"id": "A",
"type_id": 0
}
}
],
"pair": [
{
"Sequence": {
"id": "A",
"type_id": 0
}
},
{
"Sequence": {
"id": "B",
"type_id": 0
}
}
],
"special_tokens": {}
},
"decoder": {
"type": "Sequence",
"decoders": []
},
"model": {
"type": "WordLevel",
"vocab": {
"<pad>": 0,
"<unk>": 1,
"<bos>": 2,
"<eos>": 3,
"↨": 4,
"\n": 5,
"\t": 6,
" ": 7,
"0": 8,
"1": 9,
"2": 10,
"3": 11,
"4": 12,
"5": 13,
"6": 14,
"7": 15,
"8": 16,
"9": 17,
"a": 18,
"b": 19,
"c": 20,
"d": 21,
"e": 22,
"f": 23,
"g": 24,
"h": 25,
"i": 26,
"j": 27,
"k": 28,
"l": 29,
"m": 30,
"n": 31,
"o": 32,
"p": 33,
"q": 34,
"r": 35,
"s": 36,
"t": 37,
"u": 38,
"v": 39,
"w": 40,
"x": 41,
"y": 42,
"z": 43,
"\"": 44,
"!": 45,
"$": 46,
"&": 47,
"'": 48,
"#": 49,
",": 50,
"/": 51,
"+": 52,
"=": 53,
"-": 54,
"<": 55,
">": 56,
"*": 57,
"@": 58,
".": 59,
":": 60,
";": 61,
"[": 62,
"]": 63,
"{": 64,
"}": 65,
"(": 66,
")": 67,
"^": 68,
"_": 69,
"?": 70,
"%": 71,
"é": 72,
"¤69": 73,
"¤70": 74,
"¤71": 75,
"¤72": 76,
"¤73": 77,
"¤74": 78,
"¤75": 79,
"¤76": 80,
"¤77": 81,
"¤78": 82,
"¤79": 83,
"¤80": 84,
"¤81": 85,
"¤82": 86,
"¤83": 87,
"¤84": 88,
"¤85": 89,
"¤86": 90,
"¤87": 91,
"¤88": 92,
"¤89": 93,
"¤90": 94,
"¤91": 95,
"¤92": 96,
"¤93": 97,
"¤94": 98,
"¤95": 99,
"¤96": 100,
"¤97": 101,
"¤98": 102,
"¤99": 103,
"¤100": 104,
"¤101": 105,
"¤102": 106,
"¤103": 107,
"¤104": 108,
"¤105": 109,
"¤106": 110,
"¤107": 111,
"¤108": 112,
"¤109": 113,
"¤110": 114,
"¤111": 115,
"¤112": 116,
"¤113": 117,
"¤114": 118,
"¤115": 119,
"¤116": 120,
"¤117": 121,
"¤118": 122,
"¤119": 123,
"¤120": 124,
"¤121": 125,
"¤122": 126,
"¤123": 127
},
"unk_token": "<unk>"
}
}