rx-codex-v1-from-scratch / rx_codex_v1_tokenizer.json
rxmha125's picture
Initial commit of Rx Codex v1 (approx 25M params with small vocab) from scratch - Phase 1 Complete
5d1d1d8 verified
{
"version": "1.0",
"truncation": null,
"padding": null,
"added_tokens": [
{
"id": 0,
"content": "[UNK]",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 1,
"content": "[PAD]",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 2,
"content": "[BOS]",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 3,
"content": "[EOS]",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
}
],
"normalizer": {
"type": "Sequence",
"normalizers": [
{
"type": "NFC"
},
{
"type": "Lowercase"
}
]
},
"pre_tokenizer": {
"type": "ByteLevel",
"add_prefix_space": true,
"trim_offsets": true,
"use_regex": true
},
"post_processor": null,
"decoder": {
"type": "ByteLevel",
"add_prefix_space": true,
"trim_offsets": true,
"use_regex": true
},
"model": {
"type": "BPE",
"dropout": null,
"unk_token": "[UNK]",
"continuing_subword_prefix": null,
"end_of_word_suffix": null,
"fuse_unk": false,
"byte_fallback": false,
"ignore_merges": false,
"vocab": {
"[UNK]": 0,
"[PAD]": 1,
"[BOS]": 2,
"[EOS]": 3,
".": 4,
"0": 5,
"1": 6,
"2": 7,
"3": 8,
"4": 9,
"5": 10,
"6": 11,
"7": 12,
"8": 13,
"9": 14,
"a": 15,
"c": 16,
"d": 17,
"e": 18,
"f": 19,
"g": 20,
"h": 21,
"i": 22,
"j": 23,
"k": 24,
"l": 25,
"m": 26,
"n": 27,
"o": 28,
"p": 29,
"r": 30,
"s": 31,
"t": 32,
"u": 33,
"v": 34,
"x": 35,
"y": 36,
"z": 37,
"Ġ": 38,
"Ġt": 39,
"Ġf": 40,
"in": 41,
"en": 42,
"is": 43,
"er": 44,
"Ġto": 45,
"Ġa": 46,
"ain": 47,
"iz": 48,
"ken": 49,
"le": 50,
"rain": 51,
"Ġth": 52,
"Ġtrain": 53,
"Ġtoken": 54,
"izer": 55,
"Ġtokenizer": 56,
"or": 57,
"du": 58,
"ile": 59,
"mm": 60,
"om": 61,
"rom": 62,
"Ġis": 63,
"Ġdu": 64,
"Ġfor": 65,
"Ġfile": 66,
"Ġfrom": 67,
"ing": 68,
"Ġthis": 69,
"Ġtraining": 70,
"mmy": 71,
"Ġdummy": 72,
"on": 73,
"Ġs": 74,
"ce": 75,
"ten": 76,
"enten": 77,
"Ġsenten": 78,
"Ġsentence": 79,
"cs": 80,
"js": 81,
"lin": 82,
"Ġcs": 83,
"Ġjs": 84,
"Ġlin": 85,
"onl": 86,
"Ġcsv": 87,
"Ġjsonl": 88,
"Ġline": 89,
"Ġ1": 90,
"Ġ2": 91,
"Ġ3": 92,
"Ġ4": 93,
"her": 94,
"no": 95,
"ther": 96,
"Ġon": 97,
"nother": 98,
"Ġone": 99,
"another": 100,
"ex": 101,
"Ġanother": 102,
"am": 103,
"et": 104,
"ple": 105,
"yet": 106,
"Ġex": 107,
"Ġthe": 108,
"ample": 109,
"Ġexample": 110,
"Ġ0": 111,
"Ġ5": 112,
"Ġ6": 113,
"Ġ7": 114,
"Ġ8": 115,
"Ġ9": 116,
"Ġ10": 117,
"Ġ11": 118,
"Ġ12": 119,
"Ġ13": 120,
"Ġ14": 121,
"Ġ15": 122,
"Ġ16": 123,
"Ġ17": 124,
"Ġ18": 125,
"Ġ19": 126,
"Ġ20": 127,
"Ġ21": 128,
"Ġ22": 129,
"Ġ23": 130,
"Ġ24": 131,
"Ġ25": 132,
"Ġ26": 133,
"Ġ27": 134,
"Ġ28": 135,
"Ġ29": 136,
"Ġ30": 137,
"Ġ31": 138,
"Ġ32": 139,
"Ġ33": 140,
"Ġ34": 141,
"Ġ35": 142,
"Ġ36": 143,
"Ġ37": 144,
"Ġ38": 145,
"Ġ39": 146,
"Ġ40": 147,
"Ġ41": 148,
"Ġ42": 149,
"Ġ43": 150,
"Ġ44": 151,
"Ġ45": 152,
"Ġ46": 153,
"Ġ47": 154,
"Ġ48": 155,
"Ġ49": 156,
"hor": 157,
"Ġyet": 158,
"Ġtex": 159,
"Ġshor": 160,
"Ġtext": 161,
"Ġshort": 162
},
"merges": [
[
"Ġ",
"t"
],
[
"Ġ",
"f"
],
[
"i",
"n"
],
[
"e",
"n"
],
[
"i",
"s"
],
[
"e",
"r"
],
[
"Ġt",
"o"
],
[
"Ġ",
"a"
],
[
"a",
"in"
],
[
"i",
"z"
],
[
"k",
"en"
],
[
"l",
"e"
],
[
"r",
"ain"
],
[
"Ġt",
"h"
],
[
"Ġt",
"rain"
],
[
"Ġto",
"ken"
],
[
"iz",
"er"
],
[
"Ġtoken",
"izer"
],
[
"o",
"r"
],
[
"d",
"u"
],
[
"i",
"le"
],
[
"m",
"m"
],
[
"o",
"m"
],
[
"r",
"om"
],
[
"Ġ",
"is"
],
[
"Ġ",
"du"
],
[
"Ġf",
"or"
],
[
"Ġf",
"ile"
],
[
"Ġf",
"rom"
],
[
"in",
"g"
],
[
"Ġth",
"is"
],
[
"Ġtrain",
"ing"
],
[
"mm",
"y"
],
[
"Ġdu",
"mmy"
],
[
"o",
"n"
],
[
"Ġ",
"s"
],
[
"c",
"e"
],
[
"t",
"en"
],
[
"en",
"ten"
],
[
"Ġs",
"enten"
],
[
"Ġsenten",
"ce"
],
[
"c",
"s"
],
[
"j",
"s"
],
[
"l",
"in"
],
[
"Ġ",
"cs"
],
[
"Ġ",
"js"
],
[
"Ġ",
"lin"
],
[
"on",
"l"
],
[
"Ġcs",
"v"
],
[
"Ġjs",
"onl"
],
[
"Ġlin",
"e"
],
[
"Ġ",
"1"
],
[
"Ġ",
"2"
],
[
"Ġ",
"3"
],
[
"Ġ",
"4"
],
[
"h",
"er"
],
[
"n",
"o"
],
[
"t",
"her"
],
[
"Ġ",
"on"
],
[
"no",
"ther"
],
[
"Ġon",
"e"
],
[
"a",
"nother"
],
[
"e",
"x"
],
[
"Ġa",
"nother"
],
[
"a",
"m"
],
[
"e",
"t"
],
[
"p",
"le"
],
[
"y",
"et"
],
[
"Ġ",
"ex"
],
[
"Ġth",
"e"
],
[
"am",
"ple"
],
[
"Ġex",
"ample"
],
[
"Ġ",
"0"
],
[
"Ġ",
"5"
],
[
"Ġ",
"6"
],
[
"Ġ",
"7"
],
[
"Ġ",
"8"
],
[
"Ġ",
"9"
],
[
"Ġ1",
"0"
],
[
"Ġ1",
"1"
],
[
"Ġ1",
"2"
],
[
"Ġ1",
"3"
],
[
"Ġ1",
"4"
],
[
"Ġ1",
"5"
],
[
"Ġ1",
"6"
],
[
"Ġ1",
"7"
],
[
"Ġ1",
"8"
],
[
"Ġ1",
"9"
],
[
"Ġ2",
"0"
],
[
"Ġ2",
"1"
],
[
"Ġ2",
"2"
],
[
"Ġ2",
"3"
],
[
"Ġ2",
"4"
],
[
"Ġ2",
"5"
],
[
"Ġ2",
"6"
],
[
"Ġ2",
"7"
],
[
"Ġ2",
"8"
],
[
"Ġ2",
"9"
],
[
"Ġ3",
"0"
],
[
"Ġ3",
"1"
],
[
"Ġ3",
"2"
],
[
"Ġ3",
"3"
],
[
"Ġ3",
"4"
],
[
"Ġ3",
"5"
],
[
"Ġ3",
"6"
],
[
"Ġ3",
"7"
],
[
"Ġ3",
"8"
],
[
"Ġ3",
"9"
],
[
"Ġ4",
"0"
],
[
"Ġ4",
"1"
],
[
"Ġ4",
"2"
],
[
"Ġ4",
"3"
],
[
"Ġ4",
"4"
],
[
"Ġ4",
"5"
],
[
"Ġ4",
"6"
],
[
"Ġ4",
"7"
],
[
"Ġ4",
"8"
],
[
"Ġ4",
"9"
],
[
"h",
"or"
],
[
"Ġ",
"yet"
],
[
"Ġt",
"ex"
],
[
"Ġs",
"hor"
],
[
"Ġtex",
"t"
],
[
"Ġshor",
"t"
]
]
}
}