gpt2_mini_EN_bpe_500_42 / tokenizer.json
xiulinyang's picture
Upload folder using huggingface_hub
6d298a1 verified
{
"version": "1.0",
"truncation": null,
"padding": null,
"added_tokens": [
{
"id": 0,
"content": "<unk>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 1,
"content": "<pad>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 2,
"content": "<eos>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 500,
"content": "<|endoftext|>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
}
],
"normalizer": null,
"pre_tokenizer": {
"type": "ByteLevel",
"add_prefix_space": false,
"trim_offsets": true,
"use_regex": true
},
"post_processor": {
"type": "ByteLevel",
"add_prefix_space": true,
"trim_offsets": false,
"use_regex": true
},
"decoder": {
"type": "ByteLevel",
"add_prefix_space": true,
"trim_offsets": true,
"use_regex": true
},
"model": {
"type": "BPE",
"dropout": null,
"unk_token": null,
"continuing_subword_prefix": null,
"end_of_word_suffix": null,
"fuse_unk": false,
"byte_fallback": false,
"vocab": {
"<unk>": 0,
"<pad>": 1,
"<eos>": 2,
"!": 3,
"\"": 4,
"#": 5,
"$": 6,
"%": 7,
"&": 8,
"'": 9,
"(": 10,
")": 11,
"*": 12,
"+": 13,
",": 14,
"-": 15,
".": 16,
"/": 17,
"0": 18,
"1": 19,
"2": 20,
"3": 21,
"4": 22,
"5": 23,
"6": 24,
"7": 25,
"8": 26,
"9": 27,
":": 28,
";": 29,
"<": 30,
"=": 31,
">": 32,
"?": 33,
"@": 34,
"A": 35,
"B": 36,
"C": 37,
"D": 38,
"E": 39,
"F": 40,
"G": 41,
"H": 42,
"I": 43,
"J": 44,
"K": 45,
"L": 46,
"M": 47,
"N": 48,
"O": 49,
"P": 50,
"Q": 51,
"R": 52,
"S": 53,
"T": 54,
"U": 55,
"V": 56,
"W": 57,
"X": 58,
"Y": 59,
"Z": 60,
"[": 61,
"\\": 62,
"]": 63,
"^": 64,
"_": 65,
"`": 66,
"a": 67,
"b": 68,
"c": 69,
"d": 70,
"e": 71,
"f": 72,
"g": 73,
"h": 74,
"i": 75,
"j": 76,
"k": 77,
"l": 78,
"m": 79,
"n": 80,
"o": 81,
"p": 82,
"q": 83,
"r": 84,
"s": 85,
"t": 86,
"u": 87,
"v": 88,
"w": 89,
"x": 90,
"y": 91,
"z": 92,
"{": 93,
"|": 94,
"}": 95,
"~": 96,
"¡": 97,
"¢": 98,
"£": 99,
"¤": 100,
"¥": 101,
"¦": 102,
"§": 103,
"¨": 104,
"©": 105,
"ª": 106,
"«": 107,
"¬": 108,
"®": 109,
"¯": 110,
"°": 111,
"±": 112,
"²": 113,
"³": 114,
"´": 115,
"µ": 116,
"¶": 117,
"·": 118,
"¸": 119,
"¹": 120,
"º": 121,
"»": 122,
"¼": 123,
"½": 124,
"¾": 125,
"¿": 126,
"À": 127,
"Á": 128,
"Â": 129,
"Ã": 130,
"Ä": 131,
"Å": 132,
"Æ": 133,
"Ç": 134,
"È": 135,
"É": 136,
"Ê": 137,
"Ë": 138,
"Ì": 139,
"Í": 140,
"Î": 141,
"Ï": 142,
"Ð": 143,
"Ñ": 144,
"Ò": 145,
"Ó": 146,
"Ô": 147,
"Õ": 148,
"Ö": 149,
"×": 150,
"Ø": 151,
"Ù": 152,
"Ú": 153,
"Û": 154,
"Ü": 155,
"Ý": 156,
"Þ": 157,
"ß": 158,
"à": 159,
"á": 160,
"â": 161,
"ã": 162,
"ä": 163,
"å": 164,
"æ": 165,
"ç": 166,
"è": 167,
"é": 168,
"ê": 169,
"ë": 170,
"ì": 171,
"í": 172,
"î": 173,
"ï": 174,
"ð": 175,
"ñ": 176,
"ò": 177,
"ó": 178,
"ô": 179,
"õ": 180,
"ö": 181,
"÷": 182,
"ø": 183,
"ù": 184,
"ú": 185,
"û": 186,
"ü": 187,
"ý": 188,
"þ": 189,
"ÿ": 190,
"Ā": 191,
"ā": 192,
"Ă": 193,
"ă": 194,
"Ą": 195,
"ą": 196,
"Ć": 197,
"ć": 198,
"Ĉ": 199,
"ĉ": 200,
"Ċ": 201,
"ċ": 202,
"Č": 203,
"č": 204,
"Ď": 205,
"ď": 206,
"Đ": 207,
"đ": 208,
"Ē": 209,
"ē": 210,
"Ĕ": 211,
"ĕ": 212,
"Ė": 213,
"ė": 214,
"Ę": 215,
"ę": 216,
"Ě": 217,
"ě": 218,
"Ĝ": 219,
"ĝ": 220,
"Ğ": 221,
"ğ": 222,
"Ġ": 223,
"ġ": 224,
"Ģ": 225,
"ģ": 226,
"Ĥ": 227,
"ĥ": 228,
"Ħ": 229,
"ħ": 230,
"Ĩ": 231,
"ĩ": 232,
"Ī": 233,
"ī": 234,
"Ĭ": 235,
"ĭ": 236,
"Į": 237,
"į": 238,
"İ": 239,
"ı": 240,
"IJ": 241,
"ij": 242,
"Ĵ": 243,
"ĵ": 244,
"Ķ": 245,
"ķ": 246,
"ĸ": 247,
"Ĺ": 248,
"ĺ": 249,
"Ļ": 250,
"ļ": 251,
"Ľ": 252,
"ľ": 253,
"Ŀ": 254,
"ŀ": 255,
"Ł": 256,
"ł": 257,
"Ń": 258,
"Ġt": 259,
"he": 260,
"Ġa": 261,
"ou": 262,
"in": 263,
"Ġw": 264,
"re": 265,
"Ġs": 266,
"Ġthe": 267,
"on": 268,
"ha": 269,
"Ġy": 270,
"ĠI": 271,
"Ġyou": 272,
"Ġm": 273,
"ing": 274,
"Ġb": 275,
"Ġc": 276,
"is": 277,
"nd": 278,
"ll": 279,
"er": 280,
"Ġd": 281,
"or": 282,
"Ġto": 283,
"it": 284,
"Ġf": 285,
"en": 286,
"Ġg": 287,
"an": 288,
"st": 289,
"Ġo": 290,
"ed": 291,
"Ġp": 292,
"hat": 293,
"Ġl": 294,
"Ġn": 295,
"Ġhe": 296,
"ar": 297,
"ow": 298,
"Ġth": 299,
"om": 300,
"Ġin": 301,
"Ġand": 302,
"Ġbe": 303,
"Ġof": 304,
"ve": 305,
"'s": 306,
"Ġha": 307,
"'t": 308,
"et": 309,
"ot": 310,
"ĠT": 311,
"at": 312,
"id": 313,
"ay": 314,
"gh": 315,
"se": 316,
"ĠW": 317,
"Ġwe": 318,
"Ġit": 319,
"es": 320,
"Ġh": 321,
"ĠA": 322,
"ght": 323,
"ĠY": 324,
"Ġk": 325,
"al": 326,
"ic": 327,
"ld": 328,
"im": 329,
"Ġ-": 330,
"Ġu": 331,
"Ġis": 332,
"le": 333,
"Ġthat": 334,
"Ġst": 335,
"ut": 336,
"ĠS": 337,
"Ġon": 338,
"Ġfor": 339,
"ĠYou": 340,
"as": 341,
"ĠB": 342,
"Ġe": 343,
"ĠH": 344,
"out": 345,
"ac": 346,
"ke": 347,
"ly": 348,
"Ġme": 349,
"..": 350,
"all": 351,
"Ġre": 352,
"ust": 353,
"Ġare": 354,
"ight": 355,
"Ġr": 356,
"Ġkn": 357,
"ir": 358,
"ent": 359,
"ver": 360,
"ĠM": 361,
"Ġthis": 362,
"ĠThe": 363,
"Ġknow": 364,
"Ġgo": 365,
"Ġhave": 366,
"ell": 367,
"ill": 368,
"Ġmy": 369,
"Ġdo": 370,
"ec": 371,
"hing": 372,
"ĠD": 373,
"Ġli": 374,
"ome": 375,
"'re": 376,
"am": 377,
"au": 378,
"ho": 379,
"ro": 380,
"her": 381,
"ould": 382,
"ĠIt": 383,
"Ġyour": 384,
"Ġdon": 385,
"uc": 386,
"Ġan": 387,
"ith": 388,
"ion": 389,
"Ġwhat": 390,
"Ġwas": 391,
"ad": 392,
"ĠL": 393,
"Ġj": 394,
"Ġwith": 395,
"Ġcan": 396,
"ant": 397,
"oo": 398,
"ore": 399,
"ĠC": 400,
"Ġdid": 401,
"Ġnot": 402,
"ally": 403,
"Ġjust": 404,
"ee": 405,
"if": 406,
"ĠG": 407,
"Ġas": 408,
"Ġwor": 409,
"one": 410,
"ĠHe": 411,
"il": 412,
"Ġall": 413,
"Ġher": 414,
"ain": 415,
"ĠJ": 416,
"ĠO": 417,
"Ġab": 418,
"Ġsa": 419,
"Ġso": 420,
"Ġget": 421,
"ear": 422,
"op": 423,
"ĠP": 424,
"Ġout": 425,
"...": 426,
"ause": 427,
"uck": 428,
"est": 429,
"od": 430,
"ri": 431,
"ĠN": 432,
"Ġthem": 433,
"Ġhim": 434,
"ĠAnd": 435,
"Ġabout": 436,
"'m": 437,
"ab": 438,
"el": 439,
"ur": 440,
"us": 441,
"Ġat": 442,
"Ġthere": 443,
"Ġbr": 444,
"Ġhere": 445,
"ate": 446,
"Ġup": 447,
"Ġlike": 448,
"ct": 449,
"ive": 450,
"ound": 451,
"han": 452,
"Ġcon": 453,
"Ġne": 454,
"Ġus": 455,
"Ġone": 456,
"ak": 457,
"ig": 458,
"na": 459,
"rom": 460,
"un": 461,
"ure": 462,
"Ġor": 463,
"Ġtim": 464,
"Ġal": 465,
"ink": 466,
"Ġwould": 467,
"Ġwant": 468,
"Ġthey": 469,
"ong": 470,
"art": 471,
"Ġint": 472,
"Ġreally": 473,
"ook": 474,
"'ll": 475,
"are": 476,
"and": 477,
"ake": 478,
"Ġ\"": 479,
"Ġtell": 480,
"ought": 481,
"Ġwho": 482,
"Ġbut": 483,
"Ġtoo": 484,
"Ġfrom": 485,
"Ġgot": 486,
"Ġthing": 487,
"ĠTh": 488,
"ĠWhat": 489,
"Ġright": 490,
"very": 491,
"Ġany": 492,
"'ve": 493,
"em": 494,
"ge": 495,
"nt": 496,
"ost": 497,
"ud": 498,
"ĠF": 499
},
"merges": [
"Ġ t",
"h e",
"Ġ a",
"o u",
"i n",
"Ġ w",
"r e",
"Ġ s",
"Ġt he",
"o n",
"h a",
"Ġ y",
"Ġ I",
"Ġy ou",
"Ġ m",
"in g",
"Ġ b",
"Ġ c",
"i s",
"n d",
"l l",
"e r",
"Ġ d",
"o r",
"Ġt o",
"i t",
"Ġ f",
"e n",
"Ġ g",
"a n",
"s t",
"Ġ o",
"e d",
"Ġ p",
"ha t",
"Ġ l",
"Ġ n",
"Ġ he",
"a r",
"o w",
"Ġt h",
"o m",
"Ġ in",
"Ġa nd",
"Ġb e",
"Ġo f",
"v e",
"' s",
"Ġ ha",
"' t",
"e t",
"o t",
"Ġ T",
"a t",
"i d",
"a y",
"g h",
"s e",
"Ġ W",
"Ġw e",
"Ġ it",
"e s",
"Ġ h",
"Ġ A",
"gh t",
"Ġ Y",
"Ġ k",
"a l",
"i c",
"l d",
"i m",
"Ġ -",
"Ġ u",
"Ġ is",
"l e",
"Ġt hat",
"Ġs t",
"u t",
"Ġ S",
"Ġ on",
"Ġf or",
"ĠY ou",
"a s",
"Ġ B",
"Ġ e",
"Ġ H",
"ou t",
"a c",
"k e",
"l y",
"Ġm e",
". .",
"a ll",
"Ġ re",
"u st",
"Ġa re",
"i ght",
"Ġ r",
"Ġk n",
"i r",
"en t",
"v er",
"Ġ M",
"Ġth is",
"ĠT he",
"Ġkn ow",
"Ġg o",
"Ġha ve",
"e ll",
"i ll",
"Ġm y",
"Ġd o",
"e c",
"h ing",
"Ġ D",
"Ġl i",
"om e",
"' re",
"a m",
"a u",
"h o",
"r o",
"he r",
"ou ld",
"ĠI t",
"Ġyou r",
"Ġd on",
"u c",
"Ġa n",
"it h",
"i on",
"Ġw hat",
"Ġw as",
"a d",
"Ġ L",
"Ġ j",
"Ġw ith",
"Ġc an",
"an t",
"o o",
"o re",
"Ġ C",
"Ġd id",
"Ġn ot",
"all y",
"Ġj ust",
"e e",
"i f",
"Ġ G",
"Ġa s",
"Ġw or",
"on e",
"ĠH e",
"i l",
"Ġa ll",
"Ġhe r",
"a in",
"Ġ J",
"Ġ O",
"Ġa b",
"Ġs a",
"Ġs o",
"Ġg et",
"e ar",
"o p",
"Ġ P",
"Ġ out",
".. .",
"au se",
"uc k",
"e st",
"o d",
"r i",
"Ġ N",
"Ġthe m",
"Ġh im",
"ĠA nd",
"Ġab out",
"' m",
"a b",
"e l",
"u r",
"u s",
"Ġa t",
"Ġthe re",
"Ġb r",
"Ġhe re",
"at e",
"Ġu p",
"Ġli ke",
"c t",
"i ve",
"ou nd",
"ha n",
"Ġc on",
"Ġn e",
"Ġu s",
"Ġon e",
"a k",
"i g",
"n a",
"r om",
"u n",
"u re",
"Ġ or",
"Ġt im",
"Ġa l",
"in k",
"Ġw ould",
"Ġw ant",
"Ġthe y",
"on g",
"ar t",
"Ġin t",
"Ġre ally",
"oo k",
"' ll",
"a re",
"a nd",
"a ke",
"Ġ \"",
"Ġt ell",
"ou ght",
"Ġw ho",
"Ġb ut",
"Ġto o",
"Ġf rom",
"Ġg ot",
"Ġth ing",
"ĠT h",
"ĠW hat",
"Ġr ight",
"ver y",
"Ġan y",
"' ve",
"e m",
"g e",
"n t",
"o st",
"u d",
"Ġ F"
]
}
}