mingru-shakespeare / tokenizer.json
jogonba2's picture
Upload tokenizer
c9f5f6f verified
{
"version": "1.0",
"truncation": null,
"padding": null,
"added_tokens": [
{
"id": 0,
"content": "<|endoftext|>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
}
],
"normalizer": null,
"pre_tokenizer": {
"type": "ByteLevel",
"add_prefix_space": false,
"trim_offsets": true,
"use_regex": true
},
"post_processor": {
"type": "ByteLevel",
"add_prefix_space": true,
"trim_offsets": false,
"use_regex": true
},
"decoder": {
"type": "ByteLevel",
"add_prefix_space": true,
"trim_offsets": true,
"use_regex": true
},
"model": {
"type": "BPE",
"dropout": null,
"unk_token": null,
"continuing_subword_prefix": "",
"end_of_word_suffix": "",
"fuse_unk": false,
"byte_fallback": false,
"ignore_merges": false,
"vocab": {
"<|endoftext|>": 0,
"!": 1,
"$": 2,
"&": 3,
"'": 4,
",": 5,
"-": 6,
".": 7,
"3": 8,
":": 9,
";": 10,
"?": 11,
"A": 12,
"B": 13,
"C": 14,
"D": 15,
"E": 16,
"F": 17,
"G": 18,
"H": 19,
"I": 20,
"J": 21,
"K": 22,
"L": 23,
"M": 24,
"N": 25,
"O": 26,
"P": 27,
"Q": 28,
"R": 29,
"S": 30,
"T": 31,
"U": 32,
"V": 33,
"W": 34,
"X": 35,
"Y": 36,
"Z": 37,
"a": 38,
"b": 39,
"c": 40,
"d": 41,
"e": 42,
"f": 43,
"g": 44,
"h": 45,
"i": 46,
"j": 47,
"k": 48,
"l": 49,
"m": 50,
"n": 51,
"o": 52,
"p": 53,
"q": 54,
"r": 55,
"s": 56,
"t": 57,
"u": 58,
"v": 59,
"w": 60,
"x": 61,
"y": 62,
"z": 63,
"Ċ": 64,
"Ġ": 65,
"Ġt": 66,
"he": 67,
"Ġa": 68,
"ou": 69,
"Ġs": 70,
"Ġm": 71,
"in": 72,
"Ġw": 73,
"re": 74,
"ha": 75,
"Ġthe": 76,
"nd": 77,
"Ġb": 78,
"is": 79,
"or": 80,
"Ġf": 81,
"er": 82,
"ll": 83,
"it": 84,
"on": 85,
"Ġd": 86,
"Ġc": 87,
"es": 88,
"Ġl": 89,
"en": 90,
"Ġn": 91,
"Ġy": 92,
"ar": 93,
"Ġth": 94,
"Ġh": 95,
"Ġo": 96,
"Ġto": 97,
"Ġp": 98,
"Ġyou": 99,
"hat": 100,
"ĠI": 101,
"Ġhe": 102,
"ot": 103,
"ve": 104,
"ing": 105,
"Ġof": 106,
"st": 107,
"Ġand": 108,
"ow": 109,
"an": 110,
"om": 111,
"Ġg": 112,
"at": 113,
"Ġbe": 114,
"se": 115,
"ce": 116,
"Ġmy": 117,
"Ġin": 118,
"Ġha": 119,
"le": 120,
"ay": 121,
"ld": 122,
"et": 123,
"ir": 124,
"ed": 125,
"ut": 126,
"im": 127,
"ith": 128,
"'s": 129,
"Ġme": 130,
"Ġnot": 131,
"Ġthat": 132,
"ch": 133,
"gh": 134,
"our": 135,
"Ġis": 136,
"And": 137,
"Ġu": 138,
"Ġfor": 139,
"ke": 140,
"Ġwe": 141,
"oo": 142,
"ill": 143,
"Ġe": 144,
"her": 145,
"Ġwith": 146,
"Ġyour": 147,
"Ġit": 148,
"ad": 149,
"ent": 150,
"ri": 151,
"Ġst": 152,
"Ġthou": 153,
"Ġhis": 154,
"'d": 155,
"Ġk": 156,
"ord": 157,
"ome": 158,
"EN": 159,
"ght": 160,
"ra": 161,
"The": 162,
"Ġre": 163,
"IN": 164,
"Ġhim": 165,
"ly": 166,
"Ġli": 167,
"Ġhave": 168,
"id": 169,
"as": 170,
"ur": 171,
"al": 172,
"Ġthis": 173,
"Ġde": 174,
"Ġso": 175,
"Ġon": 176,
"Ġas": 177,
"AR": 178,
"ro": 179,
"ore": 180,
"hi": 181,
"ould": 182,
"ood": 183,
"ck": 184,
"ver": 185,
"ain": 186,
"est": 187,
"ess": 188,
"Ġthy": 189,
"Ġsha": 190,
"US": 191,
"Ġdo": 192,
"ea": 193,
"Ġwill": 194,
"Ġno": 195,
"am": 196,
"us": 197,
"Ġbut": 198,
"ge": 199,
"and": 200,
"Ġse": 201,
"if": 202,
"IO": 203,
"Th": 204,
"ion": 205,
"To": 206,
"Ġall": 207,
"Ġsu": 208,
"ake": 209,
"th": 210,
"ear": 211,
"ue": 212,
"Ġan": 213,
"ter": 214,
"Ġlo": 215,
"ard": 216,
"ING": 217,
"han": 218,
"Ġour": 219,
"Ġher": 220,
"Ġby": 221,
"Ġsp": 222,
"Ġfa": 223,
"ell": 224,
"ĠR": 225,
"Ġshall": 226,
"ĠC": 227,
"Ġthee": 228,
"rom": 229,
"ho": 230,
"il": 231,
"ES": 232,
"ct": 233,
"ous": 234,
"OR": 235,
"ust": 236,
"Ġv": 237,
"Ġne": 238,
"Ġare": 239,
"That": 240,
"ul": 241,
"Ġkn": 242,
"ight": 243,
"ER": 244,
"Ġwhat": 245,
"Ġlord": 246,
"Ġsh": 247,
"ast": 248,
"ath": 249,
"sel": 250,
"Ġup": 251,
"art": 252,
"ĠE": 253,
"LA": 254,
"KING": 255
},
"merges": [
[
"Ġ",
"t"
],
[
"h",
"e"
],
[
"Ġ",
"a"
],
[
"o",
"u"
],
[
"Ġ",
"s"
],
[
"Ġ",
"m"
],
[
"i",
"n"
],
[
"Ġ",
"w"
],
[
"r",
"e"
],
[
"h",
"a"
],
[
"Ġt",
"he"
],
[
"n",
"d"
],
[
"Ġ",
"b"
],
[
"i",
"s"
],
[
"o",
"r"
],
[
"Ġ",
"f"
],
[
"e",
"r"
],
[
"l",
"l"
],
[
"i",
"t"
],
[
"o",
"n"
],
[
"Ġ",
"d"
],
[
"Ġ",
"c"
],
[
"e",
"s"
],
[
"Ġ",
"l"
],
[
"e",
"n"
],
[
"Ġ",
"n"
],
[
"Ġ",
"y"
],
[
"a",
"r"
],
[
"Ġt",
"h"
],
[
"Ġ",
"h"
],
[
"Ġ",
"o"
],
[
"Ġt",
"o"
],
[
"Ġ",
"p"
],
[
"Ġy",
"ou"
],
[
"ha",
"t"
],
[
"Ġ",
"I"
],
[
"Ġ",
"he"
],
[
"o",
"t"
],
[
"v",
"e"
],
[
"in",
"g"
],
[
"Ġo",
"f"
],
[
"s",
"t"
],
[
"Ġa",
"nd"
],
[
"o",
"w"
],
[
"a",
"n"
],
[
"o",
"m"
],
[
"Ġ",
"g"
],
[
"a",
"t"
],
[
"Ġb",
"e"
],
[
"s",
"e"
],
[
"c",
"e"
],
[
"Ġm",
"y"
],
[
"Ġ",
"in"
],
[
"Ġ",
"ha"
],
[
"l",
"e"
],
[
"a",
"y"
],
[
"l",
"d"
],
[
"e",
"t"
],
[
"i",
"r"
],
[
"e",
"d"
],
[
"u",
"t"
],
[
"i",
"m"
],
[
"it",
"h"
],
[
"'",
"s"
],
[
"Ġm",
"e"
],
[
"Ġn",
"ot"
],
[
"Ġt",
"hat"
],
[
"c",
"h"
],
[
"g",
"h"
],
[
"ou",
"r"
],
[
"Ġ",
"is"
],
[
"A",
"nd"
],
[
"Ġ",
"u"
],
[
"Ġf",
"or"
],
[
"k",
"e"
],
[
"Ġw",
"e"
],
[
"o",
"o"
],
[
"i",
"ll"
],
[
"Ġ",
"e"
],
[
"he",
"r"
],
[
"Ġw",
"ith"
],
[
"Ġyou",
"r"
],
[
"Ġ",
"it"
],
[
"a",
"d"
],
[
"en",
"t"
],
[
"r",
"i"
],
[
"Ġs",
"t"
],
[
"Ġth",
"ou"
],
[
"Ġh",
"is"
],
[
"'",
"d"
],
[
"Ġ",
"k"
],
[
"or",
"d"
],
[
"om",
"e"
],
[
"E",
"N"
],
[
"gh",
"t"
],
[
"r",
"a"
],
[
"T",
"he"
],
[
"Ġ",
"re"
],
[
"I",
"N"
],
[
"Ġh",
"im"
],
[
"l",
"y"
],
[
"Ġl",
"i"
],
[
"Ġha",
"ve"
],
[
"i",
"d"
],
[
"a",
"s"
],
[
"u",
"r"
],
[
"a",
"l"
],
[
"Ġth",
"is"
],
[
"Ġd",
"e"
],
[
"Ġs",
"o"
],
[
"Ġ",
"on"
],
[
"Ġa",
"s"
],
[
"A",
"R"
],
[
"r",
"o"
],
[
"o",
"re"
],
[
"h",
"i"
],
[
"ou",
"ld"
],
[
"oo",
"d"
],
[
"c",
"k"
],
[
"v",
"er"
],
[
"a",
"in"
],
[
"es",
"t"
],
[
"es",
"s"
],
[
"Ġth",
"y"
],
[
"Ġs",
"ha"
],
[
"U",
"S"
],
[
"Ġd",
"o"
],
[
"e",
"a"
],
[
"Ġw",
"ill"
],
[
"Ġn",
"o"
],
[
"a",
"m"
],
[
"u",
"s"
],
[
"Ġb",
"ut"
],
[
"g",
"e"
],
[
"a",
"nd"
],
[
"Ġs",
"e"
],
[
"i",
"f"
],
[
"I",
"O"
],
[
"T",
"h"
],
[
"i",
"on"
],
[
"T",
"o"
],
[
"Ġa",
"ll"
],
[
"Ġs",
"u"
],
[
"a",
"ke"
],
[
"t",
"h"
],
[
"e",
"ar"
],
[
"u",
"e"
],
[
"Ġa",
"n"
],
[
"t",
"er"
],
[
"Ġl",
"o"
],
[
"ar",
"d"
],
[
"IN",
"G"
],
[
"ha",
"n"
],
[
"Ġ",
"our"
],
[
"Ġhe",
"r"
],
[
"Ġb",
"y"
],
[
"Ġs",
"p"
],
[
"Ġf",
"a"
],
[
"e",
"ll"
],
[
"Ġ",
"R"
],
[
"Ġsha",
"ll"
],
[
"Ġ",
"C"
],
[
"Ġthe",
"e"
],
[
"r",
"om"
],
[
"h",
"o"
],
[
"i",
"l"
],
[
"E",
"S"
],
[
"c",
"t"
],
[
"ou",
"s"
],
[
"O",
"R"
],
[
"u",
"st"
],
[
"Ġ",
"v"
],
[
"Ġn",
"e"
],
[
"Ġa",
"re"
],
[
"T",
"hat"
],
[
"u",
"l"
],
[
"Ġk",
"n"
],
[
"i",
"ght"
],
[
"E",
"R"
],
[
"Ġw",
"hat"
],
[
"Ġl",
"ord"
],
[
"Ġs",
"h"
],
[
"a",
"st"
],
[
"at",
"h"
],
[
"se",
"l"
],
[
"Ġu",
"p"
],
[
"ar",
"t"
],
[
"Ġ",
"E"
],
[
"L",
"A"
],
[
"K",
"ING"
]
]
}
}