Transformers
dp-tokenizer / tokenizer.json
Eli2381's picture
Upload tokenizer
18c6ab3 verified
{
"version": "1.0",
"truncation": null,
"padding": null,
"added_tokens": [
{
"id": 0,
"content": "aha",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 1,
"content": "wait",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 2,
"content": "<|endoftext|>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": true,
"special": true
},
{
"id": 3,
"content": "BoS",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 4,
"content": "EoS",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 5,
"content": "UNK",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 6,
"content": "PAD",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 7,
"content": "EoT",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 8,
"content": "BoT",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
}
],
"normalizer": null,
"pre_tokenizer": {
"type": "ByteLevel",
"add_prefix_space": false,
"trim_offsets": true,
"use_regex": true
},
"post_processor": {
"type": "ByteLevel",
"add_prefix_space": true,
"trim_offsets": false,
"use_regex": true
},
"decoder": {
"type": "ByteLevel",
"add_prefix_space": true,
"trim_offsets": true,
"use_regex": true
},
"model": {
"type": "BPE",
"dropout": null,
"unk_token": null,
"continuing_subword_prefix": "",
"end_of_word_suffix": "",
"fuse_unk": false,
"byte_fallback": false,
"ignore_merges": false,
"vocab": {
"aha": 0,
"wait": 1,
"<|endoftext|>": 2,
"BoS": 3,
"EoS": 4,
"UNK": 5,
"PAD": 6,
"EoT": 7,
"BoT": 8,
"!": 9,
"\"": 10,
"#": 11,
"$": 12,
"%": 13,
"&": 14,
"'": 15,
"(": 16,
")": 17,
"*": 18,
"+": 19,
",": 20,
"-": 21,
".": 22,
"/": 23,
"0": 24,
"1": 25,
"2": 26,
"3": 27,
"4": 28,
"5": 29,
"6": 30,
"7": 31,
"8": 32,
"9": 33,
":": 34,
";": 35,
"<": 36,
"=": 37,
">": 38,
"?": 39,
"@": 40,
"A": 41,
"B": 42,
"C": 43,
"D": 44,
"E": 45,
"F": 46,
"G": 47,
"H": 48,
"I": 49,
"J": 50,
"K": 51,
"L": 52,
"M": 53,
"N": 54,
"O": 55,
"P": 56,
"Q": 57,
"R": 58,
"S": 59,
"T": 60,
"U": 61,
"V": 62,
"W": 63,
"X": 64,
"Y": 65,
"Z": 66,
"[": 67,
"\\": 68,
"]": 69,
"^": 70,
"_": 71,
"`": 72,
"a": 73,
"b": 74,
"c": 75,
"d": 76,
"e": 77,
"f": 78,
"g": 79,
"h": 80,
"i": 81,
"j": 82,
"k": 83,
"l": 84,
"m": 85,
"n": 86,
"o": 87,
"p": 88,
"q": 89,
"r": 90,
"s": 91,
"t": 92,
"u": 93,
"v": 94,
"w": 95,
"x": 96,
"y": 97,
"z": 98,
"{": 99,
"|": 100,
"}": 101,
"~": 102,
"¡": 103,
"¢": 104,
"£": 105,
"¤": 106,
"¥": 107,
"¦": 108,
"§": 109,
"¨": 110,
"©": 111,
"ª": 112,
"«": 113,
"¬": 114,
"®": 115,
"¯": 116,
"°": 117,
"±": 118,
"²": 119,
"³": 120,
"´": 121,
"µ": 122,
"¶": 123,
"·": 124,
"¸": 125,
"¹": 126,
"º": 127,
"»": 128,
"¼": 129,
"½": 130,
"¾": 131,
"¿": 132,
"À": 133,
"Á": 134,
"Â": 135,
"Ã": 136,
"Ä": 137,
"Å": 138,
"Æ": 139,
"Ç": 140,
"È": 141,
"É": 142,
"Ê": 143,
"Ë": 144,
"Ì": 145,
"Í": 146,
"Î": 147,
"Ï": 148,
"Ð": 149,
"Ñ": 150,
"Ò": 151,
"Ó": 152,
"Ô": 153,
"Õ": 154,
"Ö": 155,
"×": 156,
"Ø": 157,
"Ù": 158,
"Ú": 159,
"Û": 160,
"Ü": 161,
"Ý": 162,
"Þ": 163,
"ß": 164,
"à": 165,
"á": 166,
"â": 167,
"ã": 168,
"ä": 169,
"å": 170,
"æ": 171,
"ç": 172,
"è": 173,
"é": 174,
"ê": 175,
"ë": 176,
"ì": 177,
"í": 178,
"î": 179,
"ï": 180,
"ð": 181,
"ñ": 182,
"ò": 183,
"ó": 184,
"ô": 185,
"õ": 186,
"ö": 187,
"÷": 188,
"ø": 189,
"ù": 190,
"ú": 191,
"û": 192,
"ü": 193,
"ý": 194,
"þ": 195,
"ÿ": 196,
"Ā": 197,
"ā": 198,
"Ă": 199,
"ă": 200,
"Ą": 201,
"ą": 202,
"Ć": 203,
"ć": 204,
"Ĉ": 205,
"ĉ": 206,
"Ċ": 207,
"ċ": 208,
"Č": 209,
"č": 210,
"Ď": 211,
"ď": 212,
"Đ": 213,
"đ": 214,
"Ē": 215,
"ē": 216,
"Ĕ": 217,
"ĕ": 218,
"Ė": 219,
"ė": 220,
"Ę": 221,
"ę": 222,
"Ě": 223,
"ě": 224,
"Ĝ": 225,
"ĝ": 226,
"Ğ": 227,
"ğ": 228,
"Ġ": 229,
"ġ": 230,
"Ģ": 231,
"ģ": 232,
"Ĥ": 233,
"ĥ": 234,
"Ħ": 235,
"ħ": 236,
"Ĩ": 237,
"ĩ": 238,
"Ī": 239,
"ī": 240,
"Ĭ": 241,
"ĭ": 242,
"Į": 243,
"į": 244,
"İ": 245,
"ı": 246,
"Ĳ": 247,
"ĳ": 248,
"Ĵ": 249,
"ĵ": 250,
"Ķ": 251,
"ķ": 252,
"ĸ": 253,
"Ĺ": 254,
"ĺ": 255,
"Ļ": 256,
"ļ": 257,
"Ľ": 258,
"ľ": 259,
"Ŀ": 260,
"ŀ": 261,
"Ł": 262,
"ł": 263,
"Ń": 264,
"Ġn": 265,
"Ġ|": 266,
"ah": 267,
"Ġah": 268,
"Ġaha": 269,
"Ġ1": 270,
"Ġ4": 271,
"Ġ3": 272,
"Ġ2": 273,
"ai": 274,
"wai": 275,
"Ġwai": 276,
"Ġwait": 277,
"10": 278,
"Ġ[": 279,
"Ġ]": 280,
"Ġl": 281,
"11": 282,
"12": 283,
"Ġ6": 284,
"Ġ7": 285,
"Bo": 286,
"Eo": 287,
"ĠEo": 288,
"Ġ5": 289,
"Ġ8": 290,
"13": 291,
"Ġ9": 292,
"Ġ10": 293,
"ĠBo": 294,
"ĠEoS": 295,
"ĠEoT": 296,
"ĠBoT": 297,
"14": 298,
"Ġ11": 299,
"Ġ12": 300,
"15": 301,
"Ġ13": 302,
"Ġ14": 303,
"16": 304,
"Ġ15": 305,
"Ġ16": 306,
"17": 307,
"Ġ17": 308,
"Ġ18": 309,
"Ġ19": 310,
"Ġ20": 311
},
"merges": [
[
"Ġ",
"n"
],
[
"Ġ",
"|"
],
[
"a",
"h"
],
[
"Ġ",
"ah"
],
[
"Ġah",
"a"
],
[
"Ġ",
"1"
],
[
"Ġ",
"4"
],
[
"Ġ",
"3"
],
[
"Ġ",
"2"
],
[
"a",
"i"
],
[
"w",
"ai"
],
[
"Ġ",
"wai"
],
[
"Ġwai",
"t"
],
[
"1",
"0"
],
[
"Ġ",
"["
],
[
"Ġ",
"]"
],
[
"Ġ",
"l"
],
[
"1",
"1"
],
[
"1",
"2"
],
[
"Ġ",
"6"
],
[
"Ġ",
"7"
],
[
"B",
"o"
],
[
"E",
"o"
],
[
"Ġ",
"Eo"
],
[
"Ġ",
"5"
],
[
"Ġ",
"8"
],
[
"1",
"3"
],
[
"Ġ",
"9"
],
[
"Ġ1",
"0"
],
[
"Ġ",
"Bo"
],
[
"Bo",
"S"
],
[
"ĠEo",
"S"
],
[
"ĠEo",
"T"
],
[
"ĠBo",
"T"
],
[
"1",
"4"
],
[
"Ġ1",
"1"
],
[
"Ġ1",
"2"
],
[
"1",
"5"
],
[
"Ġ1",
"3"
],
[
"Ġ1",
"4"
],
[
"1",
"6"
],
[
"Ġ1",
"5"
],
[
"Ġ1",
"6"
],
[
"1",
"7"
],
[
"Ġ1",
"7"
],
[
"Ġ1",
"8"
],
[
"Ġ1",
"9"
],
[
"Ġ2",
"0"
]
]
}
}