{
"version": "1.0",
"truncation": null,
"padding": null,
"added_tokens": [
{
"id": 0,
"content": "<|endoftext|>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
}
],
"normalizer": null,
"pre_tokenizer": {
"type": "ByteLevel",
"add_prefix_space": false,
"trim_offsets": true,
"use_regex": true
},
"post_processor": {
"type": "ByteLevel",
"add_prefix_space": true,
"trim_offsets": false,
"use_regex": true
},
"decoder": {
"type": "ByteLevel",
"add_prefix_space": true,
"trim_offsets": true,
"use_regex": true
},
"model": {
"type": "BPE",
"dropout": null,
"unk_token": null,
"continuing_subword_prefix": "",
"end_of_word_suffix": "",
"fuse_unk": false,
"byte_fallback": false,
"vocab": {
"<|endoftext|>": 0,
"!": 1,
"\"": 2,
"#": 3,
"$": 4,
"%": 5,
"&": 6,
"'": 7,
"(": 8,
")": 9,
"*": 10,
"+": 11,
",": 12,
"-": 13,
".": 14,
"/": 15,
"0": 16,
"1": 17,
"2": 18,
"3": 19,
"4": 20,
"5": 21,
"6": 22,
"7": 23,
"8": 24,
"9": 25,
":": 26,
";": 27,
"<": 28,
"=": 29,
">": 30,
"?": 31,
"@": 32,
"A": 33,
"B": 34,
"C": 35,
"D": 36,
"E": 37,
"F": 38,
"G": 39,
"H": 40,
"I": 41,
"J": 42,
"K": 43,
"L": 44,
"M": 45,
"N": 46,
"O": 47,
"P": 48,
"Q": 49,
"R": 50,
"S": 51,
"T": 52,
"U": 53,
"V": 54,
"W": 55,
"X": 56,
"Y": 57,
"Z": 58,
"[": 59,
"\\": 60,
"]": 61,
"^": 62,
"_": 63,
"`": 64,
"a": 65,
"b": 66,
"c": 67,
"d": 68,
"e": 69,
"f": 70,
"g": 71,
"h": 72,
"i": 73,
"j": 74,
"k": 75,
"l": 76,
"m": 77,
"n": 78,
"o": 79,
"p": 80,
"q": 81,
"r": 82,
"s": 83,
"t": 84,
"u": 85,
"v": 86,
"w": 87,
"x": 88,
"y": 89,
"z": 90,
"{": 91,
"|": 92,
"}": 93,
"~": 94,
"¡": 95,
"¢": 96,
"£": 97,
"¤": 98,
"¥": 99,
"¦": 100,
"§": 101,
"¨": 102,
"©": 103,
"ª": 104,
"«": 105,
"¬": 106,
"®": 107,
"¯": 108,
"°": 109,
"±": 110,
"²": 111,
"³": 112,
"´": 113,
"µ": 114,
"¶": 115,
"·": 116,
"¸": 117,
"¹": 118,
"º": 119,
"»": 120,
"¼": 121,
"½": 122,
"¾": 123,
"¿": 124,
"À": 125,
"Á": 126,
"Â": 127,
"Ã": 128,
"Ä": 129,
"Å": 130,
"Æ": 131,
"Ç": 132,
"È": 133,
"É": 134,
"Ê": 135,
"Ë": 136,
"Ì": 137,
"Í": 138,
"Î": 139,
"Ï": 140,
"Ð": 141,
"Ñ": 142,
"Ò": 143,
"Ó": 144,
"Ô": 145,
"Õ": 146,
"Ö": 147,
"×": 148,
"Ø": 149,
"Ù": 150,
"Ú": 151,
"Û": 152,
"Ü": 153,
"Ý": 154,
"Þ": 155,
"ß": 156,
"à": 157,
"á": 158,
"â": 159,
"ã": 160,
"ä": 161,
"å": 162,
"æ": 163,
"ç": 164,
"è": 165,
"é": 166,
"ê": 167,
"ë": 168,
"ì": 169,
"í": 170,
"î": 171,
"ï": 172,
"ð": 173,
"ñ": 174,
"ò": 175,
"ó": 176,
"ô": 177,
"õ": 178,
"ö": 179,
"÷": 180,
"ø": 181,
"ù": 182,
"ú": 183,
"û": 184,
"ü": 185,
"ý": 186,
"þ": 187,
"ÿ": 188,
"Ā": 189,
"ā": 190,
"Ă": 191,
"ă": 192,
"Ą": 193,
"ą": 194,
"Ć": 195,
"ć": 196,
"Ĉ": 197,
"ĉ": 198,
"Ċ": 199,
"ċ": 200,
"Č": 201,
"č": 202,
"Ď": 203,
"ď": 204,
"Đ": 205,
"đ": 206,
"Ē": 207,
"ē": 208,
"Ĕ": 209,
"ĕ": 210,
"Ė": 211,
"ė": 212,
"Ę": 213,
"ę": 214,
"Ě": 215,
"ě": 216,
"Ĝ": 217,
"ĝ": 218,
"Ğ": 219,
"ğ": 220,
"Ġ": 221,
"ġ": 222,
"Ģ": 223,
"ģ": 224,
"Ĥ": 225,
"ĥ": 226,
"Ħ": 227,
"ħ": 228,
"Ĩ": 229,
"ĩ": 230,
"Ī": 231,
"ī": 232,
"Ĭ": 233,
"ĭ": 234,
"Į": 235,
"į": 236,
"İ": 237,
"ı": 238,
"Ĳ": 239,
"ĳ": 240,
"Ĵ": 241,
"ĵ": 242,
"Ķ": 243,
"ķ": 244,
"ĸ": 245,
"Ĺ": 246,
"ĺ": 247,
"Ļ": 248,
"ļ": 249,
"Ľ": 250,
"ľ": 251,
"Ŀ": 252,
"ŀ": 253,
"Ł": 254,
"ł": 255,
"Ń": 256,
"er": 257,
"Ġo": 258,
"or": 259,
"ti": 260,
"Ġi": 261,
"Ġa": 262,
"for": 263,
"gi": 264,
"he": 265,
"ma": 266,
"Ġfor": 267,
"Ġis": 268,
"on": 269,
"the": 270,
"Ġs": 271,
"Ġof": 272,
"tier": 273,
"at": 274,
"ati": 275,
"bgi": 276,
"in": 277,
"le": 278,
"ns": 279,
"ation": 280,
"fi": 281,
"js": 282,
"me": 283,
"ne": 284,
"pl": 285,
"per": 286,
"re": 287,
"rs": 288,
"st": 289,
"Ġ2": 290,
"Ġc": 291,
"Ġt": 292,
"Ġma": 293,
"Ġtier": 294,
"Ġone": 295,
"Ġoper": 296,
"Ġin": 297,
"ator": 298,
"jsrs": 299,
"Ġoperator": 300,
"al": 301,
"ay": 302,
"ans": 303,
"cation": 304,
"ed": 305,
"es": 306,
"ho": 307,
"il": 308,
"ier": 309,
"nt": 310,
"ole": 311,
"ppl": 312,
"role": 313,
"uppl": 314,
"ws": 315,
"wor": 316,
"Ġn": 317,
"Ġv": 318,
"Ġthe": 319,
"Ġbgi": 320,
"Ġns": 321,
"Ġme": 322,
"Ġre": 323,
"Ġjsrs": 324,
"Ġwor": 325,
"Ġoma": 326,
"Ġan": 327,
"gist": 328,
"Ġsho": 329,
"Ġsuppl": 330,
"fication": 331,
"Ġcer": 332,
"Ġto": 333,
"Ġmain": 334,
"ays": 335,
"Ġnss": 336,
"Ġmeans": 337,
"Ġregist": 338,
"Ġwork": 339,
"Ġoman": 340,
"Ġshows": 341,
"Ġsupplier": 342,
"02": 343,
"Af": 344,
"ab": 345,
"ar": 346,
"ax": 347,
"ail": 348,
"be": 349,
"bf": 350,
"bu": 351,
"cv": 352,
"em": 353,
"et": 354,
"ew": 355,
"gat": 356,
"her": 357,
"icv": 358,
"jo": 359,
"jor": 360,
"ll": 361,
"mo": 362,
"mp": 363,
"ot": 364,
"ou": 365,
"oon": 366,
"pr": 367,
"pet": 368,
"ry": 369,
"ration": 370,
"she": 371,
"sin": 372,
"sgat": 373,
"tw": 374,
"ter": 375,
"tfication": 376,
"ue": 377,
"um": 378,
"ves": 379,
"vail": 380,
"yst": 381,
"zoon": 382,
"Ġ1": 383,
"Ġpl": 384,
"Ġrole": 385,
"Ġbe": 386,
"Ġjo": 387,
"Ġmo": 388,
"Ġpet": 389,
"ers": 390,
"ered": 391,
"erfication": 392,
"Ġothe": 393,
"tifi": 394,
"Ġimp": 395,
"Ġare": 396,
"Ġapr": 397,
"Ġavail": 398,
"mazoon": 399,
"Ġsyst": 400,
"int": 401,
"leme": 402,
"ational": 403,
"Ġ202": 404,
"Ġcou": 405,
"Ġtax": 406,
"Ġmajor": 407,
"Ġtiers": 408,
"Ġoperators": 409,
"alue": 410,
"essgat": 411,
"ntry": 412,
"nters": 413,
"roleum": 414,
"Ġnbf": 415,
"Ġnational": 416,
"Ġverfication": 417,
"Ġvalue": 418,
"Ġand": 419,
"Ġanot": 420,
"Ġcertfication": 421,
"Ġcertifi": 422,
"Ġregistration": 423,
"Ġregistered": 424,
"Ġomani": 425,
"After": 426,
"able": 427,
"ara": 428,
"busin": 429,
"eways": 430,
"shell": 431,
"two": 432,
"Ġplays": 433,
"Ġjoint": 434,
"Ġmoves": 435,
"Ġpetroleum": 436,
"Ġimpleme": 437,
"Ġapril": 438,
"Ġavailable": 439,
"Ġsystem": 440,
"Ġ2021": 441,
"Ġcountry": 442,
"essgateways": 443,
"Ġnationals": 444,
"Ġanother": 445,
"Ġcertified": 446,
"businessgateways": 447,
"Ġimplementers": 448
},
"merges": [
"e r",
"Ġ o",
"o r",
"t i",
"Ġ i",
"Ġ a",
"f or",
"g i",
"h e",
"m a",
"Ġ for",
"Ġi s",
"o n",
"t he",
"Ġ s",
"Ġo f",
"ti er",
"a t",
"a ti",
"b gi",
"i n",
"l e",
"n s",
"ati on",
"f i",
"j s",
"m e",
"n e",
"p l",
"p er",
"r e",
"r s",
"s t",
"Ġ 2",
"Ġ c",
"Ġ t",
"Ġ ma",
"Ġ tier",
"Ġo ne",
"Ġo per",
"Ġi n",
"at or",
"js rs",
"Ġoper ator",
"a l",
"a y",
"a ns",
"c ation",
"e d",
"e s",
"h o",
"i l",
"i er",
"n t",
"o le",
"p pl",
"r ole",
"u ppl",
"w s",
"w or",
"Ġ n",
"Ġ v",
"Ġ the",
"Ġ bgi",
"Ġ ns",
"Ġ me",
"Ġ re",
"Ġ jsrs",
"Ġ wor",
"Ġo ma",
"Ġa n",
"gi st",
"Ġs ho",
"Ġs uppl",
"fi cation",
"Ġc er",
"Ġt o",
"Ġma in",
"ay s",
"Ġns s",
"Ġme ans",
"Ġre gist",
"Ġwor k",
"Ġoma n",
"Ġsho ws",
"Ġsuppl ier",
"0 2",
"A f",
"a b",
"a r",
"a x",
"a il",
"b e",
"b f",
"b u",
"c v",
"e m",
"e t",
"e w",
"g at",
"h er",
"i cv",
"j o",
"j or",
"l l",
"m o",
"m p",
"o t",
"o u",
"o on",
"p r",
"p et",
"r y",
"r ation",
"s he",
"s in",
"s gat",
"t w",
"t er",
"t fication",
"u e",
"u m",
"v es",
"v ail",
"y st",
"z oon",
"Ġ 1",
"Ġ pl",
"Ġ role",
"Ġ be",
"Ġ jo",
"Ġ mo",
"Ġ pet",
"er s",
"er ed",
"er fication",
"Ġo the",
"ti fi",
"Ġi mp",
"Ġa re",
"Ġa pr",
"Ġa vail",
"ma zoon",
"Ġs yst",
"in t",
"le me",
"ation al",
"Ġ2 02",
"Ġc ou",
"Ġt ax",
"Ġma jor",
"Ġtier s",
"Ġoperator s",
"al ue",
"es sgat",
"nt ry",
"nt ers",
"role um",
"Ġn bf",
"Ġn ational",
"Ġv erfication",
"Ġv alue",
"Ġan d",
"Ġan ot",
"Ġcer tfication",
"Ġcer tifi",
"Ġregist ration",
"Ġregist ered",
"Ġoman i",
"Af ter",
"ab le",
"ar a",
"bu sin",
"ew ays",
"she ll",
"tw o",
"Ġpl ays",
"Ġjo int",
"Ġmo ves",
"Ġpet roleum",
"Ġimp leme",
"Ġapr il",
"Ġavail able",
"Ġsyst em",
"Ġ202 1",
"Ġcou ntry",
"essgat eways",
"Ġnational s",
"Ġanot her",
"Ġcertifi ed",
"busin essgateways",
"Ġimpleme nters"
]
}
}