char128_shift_tokenizer / tokenizer.json

Update tokenizer.json

c1010b0 verified 3 months ago

7.01 kB

	{
	"version": "1.0",
	"truncation": null,
	"padding": null,
	"added_tokens": [
	{
	"id": 0,
	"content": "<pad>",
	"single_word": false,
	"lstrip": false,
	"rstrip": false,
	"normalized": false,
	"special": true
	},
	{
	"id": 1,
	"content": "<unk>",
	"single_word": false,
	"lstrip": false,
	"rstrip": false,
	"normalized": false,
	"special": true
	},
	{
	"id": 2,
	"content": "<bos>",
	"single_word": false,
	"lstrip": false,
	"rstrip": false,
	"normalized": false,
	"special": true
	},
	{
	"id": 3,
	"content": "<eos>",
	"single_word": false,
	"lstrip": false,
	"rstrip": false,
	"normalized": false,
	"special": true
	}
	],
	"normalizer": {
	"type": "Sequence",
	"normalizers": [
	{
	"type": "NFKC"
	},
	{
	"type": "Replace",
	"pattern": {
	"Regex": "A"
	},
	"content": "↨a"
	},
	{
	"type": "Replace",
	"pattern": {
	"Regex": "B"
	},
	"content": "↨b"
	},
	{
	"type": "Replace",
	"pattern": {
	"Regex": "C"
	},
	"content": "↨c"
	},
	{
	"type": "Replace",
	"pattern": {
	"Regex": "D"
	},
	"content": "↨d"
	},
	{
	"type": "Replace",
	"pattern": {
	"Regex": "E"
	},
	"content": "↨e"
	},
	{
	"type": "Replace",
	"pattern": {
	"Regex": "F"
	},
	"content": "↨f"
	},
	{
	"type": "Replace",
	"pattern": {
	"Regex": "G"
	},
	"content": "↨g"
	},
	{
	"type": "Replace",
	"pattern": {
	"Regex": "H"
	},
	"content": "↨h"
	},
	{
	"type": "Replace",
	"pattern": {
	"Regex": "I"
	},
	"content": "↨i"
	},
	{
	"type": "Replace",
	"pattern": {
	"Regex": "J"
	},
	"content": "↨j"
	},
	{
	"type": "Replace",
	"pattern": {
	"Regex": "K"
	},
	"content": "↨k"
	},
	{
	"type": "Replace",
	"pattern": {
	"Regex": "L"
	},
	"content": "↨l"
	},
	{
	"type": "Replace",
	"pattern": {
	"Regex": "M"
	},
	"content": "↨m"
	},
	{
	"type": "Replace",
	"pattern": {
	"Regex": "N"
	},
	"content": "↨n"
	},
	{
	"type": "Replace",
	"pattern": {
	"Regex": "O"
	},
	"content": "↨o"
	},
	{
	"type": "Replace",
	"pattern": {
	"Regex": "P"
	},
	"content": "↨p"
	},
	{
	"type": "Replace",
	"pattern": {
	"Regex": "Q"
	},
	"content": "↨q"
	},
	{
	"type": "Replace",
	"pattern": {
	"Regex": "R"
	},
	"content": "↨r"
	},
	{
	"type": "Replace",
	"pattern": {
	"Regex": "S"
	},
	"content": "↨s"
	},
	{
	"type": "Replace",
	"pattern": {
	"Regex": "T"
	},
	"content": "↨t"
	},
	{
	"type": "Replace",
	"pattern": {
	"Regex": "U"
	},
	"content": "↨u"
	},
	{
	"type": "Replace",
	"pattern": {
	"Regex": "V"
	},
	"content": "↨v"
	},
	{
	"type": "Replace",
	"pattern": {
	"Regex": "W"
	},
	"content": "↨w"
	},
	{
	"type": "Replace",
	"pattern": {
	"Regex": "X"
	},
	"content": "↨x"
	},
	{
	"type": "Replace",
	"pattern": {
	"Regex": "Y"
	},
	"content": "↨y"
	},
	{
	"type": "Replace",
	"pattern": {
	"Regex": "Z"
	},
	"content": "↨z"
	}
	]
	},
	"pre_tokenizer": {
	"type": "Split",
	"pattern": {
	"Regex": "\\X"
	},
	"behavior": "Isolated",
	"invert": false
	},
	"post_processor": {
	"type": "TemplateProcessing",
	"single": [
	{
	"Sequence": {
	"id": "A",
	"type_id": 0
	}
	}
	],
	"pair": [
	{
	"Sequence": {
	"id": "A",
	"type_id": 0
	}
	},
	{
	"Sequence": {
	"id": "B",
	"type_id": 0
	}
	}
	],
	"special_tokens": {}
	},
	"decoder": {
	"type": "Sequence",
	"decoders": []
	},
	"model": {
	"type": "WordLevel",
	"vocab": {
	"<pad>": 0,
	"<unk>": 1,
	"<bos>": 2,
	"<eos>": 3,
	"↨": 4,
	"\n": 5,
	"\t": 6,
	" ": 7,
	"0": 8,
	"1": 9,
	"2": 10,
	"3": 11,
	"4": 12,
	"5": 13,
	"6": 14,
	"7": 15,
	"8": 16,
	"9": 17,
	"a": 18,
	"b": 19,
	"c": 20,
	"d": 21,
	"e": 22,
	"f": 23,
	"g": 24,
	"h": 25,
	"i": 26,
	"j": 27,
	"k": 28,
	"l": 29,
	"m": 30,
	"n": 31,
	"o": 32,
	"p": 33,
	"q": 34,
	"r": 35,
	"s": 36,
	"t": 37,
	"u": 38,
	"v": 39,
	"w": 40,
	"x": 41,
	"y": 42,
	"z": 43,
	"\"": 44,
	"!": 45,
	"$": 46,
	"&": 47,
	"'": 48,
	"#": 49,
	",": 50,
	"/": 51,
	"+": 52,
	"=": 53,
	"-": 54,
	"<": 55,
	">": 56,
	"*": 57,
	"@": 58,
	".": 59,
	":": 60,
	";": 61,
	"[": 62,
	"]": 63,
	"{": 64,
	"}": 65,
	"(": 66,
	")": 67,
	"^": 68,
	"_": 69,
	"?": 70,
	"%": 71,
	"é": 72,
	"¤69": 73,
	"¤70": 74,
	"¤71": 75,
	"¤72": 76,
	"¤73": 77,
	"¤74": 78,
	"¤75": 79,
	"¤76": 80,
	"¤77": 81,
	"¤78": 82,
	"¤79": 83,
	"¤80": 84,
	"¤81": 85,
	"¤82": 86,
	"¤83": 87,
	"¤84": 88,
	"¤85": 89,
	"¤86": 90,
	"¤87": 91,
	"¤88": 92,
	"¤89": 93,
	"¤90": 94,
	"¤91": 95,
	"¤92": 96,
	"¤93": 97,
	"¤94": 98,
	"¤95": 99,
	"¤96": 100,
	"¤97": 101,
	"¤98": 102,
	"¤99": 103,
	"¤100": 104,
	"¤101": 105,
	"¤102": 106,
	"¤103": 107,
	"¤104": 108,
	"¤105": 109,
	"¤106": 110,
	"¤107": 111,
	"¤108": 112,
	"¤109": 113,
	"¤110": 114,
	"¤111": 115,
	"¤112": 116,
	"¤113": 117,
	"¤114": 118,
	"¤115": 119,
	"¤116": 120,
	"¤117": 121,
	"¤118": 122,
	"¤119": 123,
	"¤120": 124,
	"¤121": 125,
	"¤122": 126,
	"¤123": 127
	},
	"unk_token": "<unk>"
	}
	}