sample_tokenizer2 / tokenizer.json
pradeep4321's picture
Upload tokenizer
56646ff
raw
history blame
10.5 kB
{
"version": "1.0",
"truncation": null,
"padding": null,
"added_tokens": [
{
"id": 0,
"content": "<|endoftext|>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
}
],
"normalizer": null,
"pre_tokenizer": {
"type": "ByteLevel",
"add_prefix_space": false,
"trim_offsets": true,
"use_regex": true
},
"post_processor": {
"type": "ByteLevel",
"add_prefix_space": true,
"trim_offsets": false,
"use_regex": true
},
"decoder": {
"type": "ByteLevel",
"add_prefix_space": true,
"trim_offsets": true,
"use_regex": true
},
"model": {
"type": "BPE",
"dropout": null,
"unk_token": null,
"continuing_subword_prefix": "",
"end_of_word_suffix": "",
"fuse_unk": false,
"byte_fallback": false,
"vocab": {
"<|endoftext|>": 0,
"!": 1,
"\"": 2,
"#": 3,
"$": 4,
"%": 5,
"&": 6,
"'": 7,
"(": 8,
")": 9,
"*": 10,
"+": 11,
",": 12,
"-": 13,
".": 14,
"/": 15,
"0": 16,
"1": 17,
"2": 18,
"3": 19,
"4": 20,
"5": 21,
"6": 22,
"7": 23,
"8": 24,
"9": 25,
":": 26,
";": 27,
"<": 28,
"=": 29,
">": 30,
"?": 31,
"@": 32,
"A": 33,
"B": 34,
"C": 35,
"D": 36,
"E": 37,
"F": 38,
"G": 39,
"H": 40,
"I": 41,
"J": 42,
"K": 43,
"L": 44,
"M": 45,
"N": 46,
"O": 47,
"P": 48,
"Q": 49,
"R": 50,
"S": 51,
"T": 52,
"U": 53,
"V": 54,
"W": 55,
"X": 56,
"Y": 57,
"Z": 58,
"[": 59,
"\\": 60,
"]": 61,
"^": 62,
"_": 63,
"`": 64,
"a": 65,
"b": 66,
"c": 67,
"d": 68,
"e": 69,
"f": 70,
"g": 71,
"h": 72,
"i": 73,
"j": 74,
"k": 75,
"l": 76,
"m": 77,
"n": 78,
"o": 79,
"p": 80,
"q": 81,
"r": 82,
"s": 83,
"t": 84,
"u": 85,
"v": 86,
"w": 87,
"x": 88,
"y": 89,
"z": 90,
"{": 91,
"|": 92,
"}": 93,
"~": 94,
"¡": 95,
"¢": 96,
"£": 97,
"¤": 98,
"¥": 99,
"¦": 100,
"§": 101,
"¨": 102,
"©": 103,
"ª": 104,
"«": 105,
"¬": 106,
"®": 107,
"¯": 108,
"°": 109,
"±": 110,
"²": 111,
"³": 112,
"´": 113,
"µ": 114,
"¶": 115,
"·": 116,
"¸": 117,
"¹": 118,
"º": 119,
"»": 120,
"¼": 121,
"½": 122,
"¾": 123,
"¿": 124,
"À": 125,
"Á": 126,
"Â": 127,
"Ã": 128,
"Ä": 129,
"Å": 130,
"Æ": 131,
"Ç": 132,
"È": 133,
"É": 134,
"Ê": 135,
"Ë": 136,
"Ì": 137,
"Í": 138,
"Î": 139,
"Ï": 140,
"Ð": 141,
"Ñ": 142,
"Ò": 143,
"Ó": 144,
"Ô": 145,
"Õ": 146,
"Ö": 147,
"×": 148,
"Ø": 149,
"Ù": 150,
"Ú": 151,
"Û": 152,
"Ü": 153,
"Ý": 154,
"Þ": 155,
"ß": 156,
"à": 157,
"á": 158,
"â": 159,
"ã": 160,
"ä": 161,
"å": 162,
"æ": 163,
"ç": 164,
"è": 165,
"é": 166,
"ê": 167,
"ë": 168,
"ì": 169,
"í": 170,
"î": 171,
"ï": 172,
"ð": 173,
"ñ": 174,
"ò": 175,
"ó": 176,
"ô": 177,
"õ": 178,
"ö": 179,
"÷": 180,
"ø": 181,
"ù": 182,
"ú": 183,
"û": 184,
"ü": 185,
"ý": 186,
"þ": 187,
"ÿ": 188,
"Ā": 189,
"ā": 190,
"Ă": 191,
"ă": 192,
"Ą": 193,
"ą": 194,
"Ć": 195,
"ć": 196,
"Ĉ": 197,
"ĉ": 198,
"Ċ": 199,
"ċ": 200,
"Č": 201,
"č": 202,
"Ď": 203,
"ď": 204,
"Đ": 205,
"đ": 206,
"Ē": 207,
"ē": 208,
"Ĕ": 209,
"ĕ": 210,
"Ė": 211,
"ė": 212,
"Ę": 213,
"ę": 214,
"Ě": 215,
"ě": 216,
"Ĝ": 217,
"ĝ": 218,
"Ğ": 219,
"ğ": 220,
"Ġ": 221,
"ġ": 222,
"Ģ": 223,
"ģ": 224,
"Ĥ": 225,
"ĥ": 226,
"Ħ": 227,
"ħ": 228,
"Ĩ": 229,
"ĩ": 230,
"Ī": 231,
"ī": 232,
"Ĭ": 233,
"ĭ": 234,
"Į": 235,
"į": 236,
"İ": 237,
"ı": 238,
"Ĳ": 239,
"ĳ": 240,
"Ĵ": 241,
"ĵ": 242,
"Ķ": 243,
"ķ": 244,
"ĸ": 245,
"Ĺ": 246,
"ĺ": 247,
"Ļ": 248,
"ļ": 249,
"Ľ": 250,
"ľ": 251,
"Ŀ": 252,
"ŀ": 253,
"Ł": 254,
"ł": 255,
"Ń": 256,
"or": 257,
"er": 258,
"at": 259,
"in": 260,
"is": 261,
"on": 262,
"Ġo": 263,
"Ġa": 264,
"Ġis": 265,
"al": 266,
"for": 267,
"ion": 268,
"re": 269,
"tr": 270,
"Ġc": 271,
"Ġm": 272,
"Ġs": 273,
"Ġfor": 274,
"ation": 275,
"Ġof": 276,
"an": 277,
"bu": 278,
"ct": 279,
"ed": 280,
"ic": 281,
"ier": 282,
"lier": 283,
"pp": 284,
"pr": 285,
"per": 286,
"rs": 287,
"srs": 288,
"th": 289,
"upp": 290,
"ys": 291,
"Ġj": 292,
"Ġth": 293,
"ors": 294,
"ators": 295,
"int": 296,
"Ġare": 297,
"pro": 298,
"perators": 299,
"upplier": 300,
"In": 301,
"Jsrs": 302,
"Su": 303,
"aj": 304,
"ays": 305,
"bs": 306,
"du": 307,
"dy": 308,
"em": 309,
"en": 310,
"es": 311,
"ew": 312,
"ey": 313,
"ean": 314,
"ect": 315,
"fic": 316,
"gi": 317,
"gs": 318,
"gat": 319,
"gis": 320,
"hic": 321,
"ibu": 322,
"idy": 323,
"ien": 324,
"ific": 325,
"ject": 326,
"ked": 327,
"lin": 328,
"man": 329,
"ntr": 330,
"nation": 331,
"ou": 332,
"oint": 333,
"operators": 334,
"sin": 335,
"supplier": 336,
"sgat": 337,
"tor": 338,
"ted": 339,
"tem": 340,
"tific": 341,
"ue": 342,
"ver": 343,
"val": 344,
"whic": 345,
"Ġor": 346,
"Ġre": 347,
"Ġbu": 348,
"Ġint": 349,
"Ġpro": 350,
"Ġgi": 351,
"Ġnation": 352,
"Ġval": 353,
"Ġwhic": 354,
"erlin": 355,
"ertific": 356,
"ings": 357,
"ontr": 358,
"oney": 359,
"Ġoperators": 360,
"Ġoman": 361,
"ale": 362,
"als": 363,
"tration": 364,
"Ġcou": 365,
"Ġcertific": 366,
"Ġcontr": 367,
"Ġmaj": 368,
"Ġmean": 369,
"Ġmoney": 370,
"Ġsys": 371,
"Ġsupplier": 372,
"Ġsale": 373,
"cts": 374,
"Ġjsrs": 375,
"Ġjoint": 376,
"Ġthe": 377,
"Ġthings": 378,
"produ": 379,
"Subs": 380,
"essgat": 381,
"eways": 382,
"gistration": 383,
"ibutor": 384,
"iented": 385,
"ntry": 386,
"sinessgat": 387,
"suppliers": 388,
"Ġoriented": 389,
"Ġregistration": 390,
"Ġbusinessgat": 391,
"Ġinterlin": 392,
"Ġproject": 393,
"Ġgiver": 394,
"Ġnationals": 395,
"Ġvalue": 396,
"Ġwhich": 397,
"Ġcountry": 398,
"Ġcertification": 399,
"Ġcontributor": 400,
"Ġmajor": 401,
"Ġmeans": 402,
"Ġsystem": 403,
"products": 404,
"Subsidy": 405,
"Ġbusinessgateways": 406,
"Ġinterlinked": 407
},
"merges": [
"o r",
"e r",
"a t",
"i n",
"i s",
"o n",
"Ġ o",
"Ġ a",
"Ġ is",
"a l",
"f or",
"i on",
"r e",
"t r",
"Ġ c",
"Ġ m",
"Ġ s",
"Ġ for",
"at ion",
"Ġo f",
"a n",
"b u",
"c t",
"e d",
"i c",
"i er",
"l ier",
"p p",
"p r",
"p er",
"r s",
"s rs",
"t h",
"u pp",
"y s",
"Ġ j",
"Ġ th",
"or s",
"at ors",
"in t",
"Ġa re",
"pr o",
"per ators",
"upp lier",
"I n",
"J srs",
"S u",
"a j",
"a ys",
"b s",
"d u",
"d y",
"e m",
"e n",
"e s",
"e w",
"e y",
"e an",
"e ct",
"f ic",
"g i",
"g s",
"g at",
"g is",
"h ic",
"i bu",
"i dy",
"i en",
"i fic",
"j ect",
"k ed",
"l in",
"m an",
"n tr",
"n ation",
"o u",
"o int",
"o perators",
"s in",
"s upplier",
"s gat",
"t or",
"t ed",
"t em",
"t ific",
"u e",
"v er",
"v al",
"w hic",
"Ġ or",
"Ġ re",
"Ġ bu",
"Ġ int",
"Ġ pro",
"Ġ gi",
"Ġ nation",
"Ġ val",
"Ġ whic",
"er lin",
"er tific",
"in gs",
"on tr",
"on ey",
"Ġo perators",
"Ġo man",
"al e",
"al s",
"tr ation",
"Ġc ou",
"Ġc ertific",
"Ġc ontr",
"Ġm aj",
"Ġm ean",
"Ġm oney",
"Ġs ys",
"Ġs upplier",
"Ġs ale",
"ct s",
"Ġj srs",
"Ġj oint",
"Ġth e",
"Ġth ings",
"pro du",
"Su bs",
"es sgat",
"ew ays",
"gis tration",
"ibu tor",
"ien ted",
"ntr y",
"sin essgat",
"supplier s",
"Ġor iented",
"Ġre gistration",
"Ġbu sinessgat",
"Ġint erlin",
"Ġpro ject",
"Ġgi ver",
"Ġnation als",
"Ġval ue",
"Ġwhic h",
"Ġcou ntry",
"Ġcertific ation",
"Ġcontr ibutor",
"Ġmaj or",
"Ġmean s",
"Ġsys tem",
"produ cts",
"Subs idy",
"Ġbusinessgat eways",
"Ġinterlin ked"
]
}
}