mineral-1b / tokenizer.json

Upload folder using huggingface_hub

edd4ead verified 3 months ago

6.76 kB

	{
	"version": "1.0",
	"truncation": null,
	"padding": null,
	"added_tokens": [
	{
	"id": 0,
	"content": "<pad>",
	"single_word": false,
	"lstrip": false,
	"rstrip": false,
	"normalized": false,
	"special": true
	},
	{
	"id": 1,
	"content": "<\|start\|>",
	"single_word": false,
	"lstrip": false,
	"rstrip": false,
	"normalized": false,
	"special": true
	},
	{
	"id": 2,
	"content": "<\|end\|>",
	"single_word": false,
	"lstrip": false,
	"rstrip": false,
	"normalized": false,
	"special": true
	},
	{
	"id": 3,
	"content": "<\|return\|>",
	"single_word": false,
	"lstrip": false,
	"rstrip": false,
	"normalized": false,
	"special": true
	},
	{
	"id": 4,
	"content": "<\|call\|>",
	"single_word": false,
	"lstrip": false,
	"rstrip": false,
	"normalized": false,
	"special": true
	},
	{
	"id": 5,
	"content": "<\|message\|>",
	"single_word": false,
	"lstrip": false,
	"rstrip": false,
	"normalized": false,
	"special": true
	},
	{
	"id": 6,
	"content": "<\|channel\|>",
	"single_word": false,
	"lstrip": false,
	"rstrip": false,
	"normalized": false,
	"special": true
	}
	],
	"normalizer": {
	"type": "Sequence",
	"normalizers": [
	{
	"type": "Prepend",
	"prepend": "▁"
	},
	{
	"type": "Replace",
	"pattern": {
	"String": " "
	},
	"content": "▁"
	}
	]
	},
	"pre_tokenizer": {
	"type": "Metaspace",
	"replacement": "▁",
	"add_prefix_space": true,
	"split": true
	},
	"post_processor": {
	"type": "TemplateProcessing",
	"single": [
	{
	"SpecialToken": {
	"id": "<\|start\|>",
	"type_id": 0
	}
	},
	{
	"Sequence": {
	"id": "A",
	"type_id": 0
	}
	}
	],
	"pair": [
	{
	"SpecialToken": {
	"id": "<\|start\|>",
	"type_id": 0
	}
	},
	{
	"Sequence": {
	"id": "A",
	"type_id": 0
	}
	},
	{
	"Sequence": {
	"id": "B",
	"type_id": 1
	}
	}
	],
	"special_tokens": {
	"<\|start\|>": {
	"id": "<\|start\|>",
	"ids": [1],
	"tokens": ["<\|start\|>"]
	}
	}
	},
	"decoder": {
	"type": "Metaspace",
	"replacement": "▁",
	"add_prefix_space": true,
	"split": true
	},
	"model": {
	"type": "BPE",
	"dropout": null,
	"unk_token": null,
	"continuing_subword_prefix": "",
	"end_of_word_suffix": "",
	"fuse_unk": false,
	"byte_fallback": true,
	"vocab": {
	"<pad>": 0,
	"<\|start\|>": 1,
	"<\|end\|>": 2,
	"<\|return\|>": 3,
	"<\|call\|>": 4,
	"<\|message\|>": 5,
	"<\|channel\|>": 6,
	"▁": 7,
	"▁the": 8,
	"▁a": 9,
	"▁to": 10,
	"▁of": 11,
	"▁and": 12,
	"▁in": 13,
	"▁is": 14,
	"▁for": 15,
	"▁that": 16,
	"▁on": 17,
	"▁with": 18,
	"▁as": 19,
	"▁it": 20,
	"▁at": 21,
	"▁by": 22,
	"▁from": 23,
	"▁or": 24,
	"▁an": 25,
	"▁this": 26,
	"▁be": 27,
	"▁are": 28,
	"▁was": 29,
	"▁not": 30,
	"▁you": 31,
	"▁have": 32,
	"▁can": 33,
	"▁will": 34,
	"▁we": 35,
	"▁but": 36,
	"▁all": 37,
	"▁they": 38,
	"▁has": 39,
	"▁one": 40,
	"▁more": 41,
	"▁would": 42,
	"▁if": 43,
	"▁there": 44,
	"▁their": 45,
	"▁which": 46,
	"▁about": 47,
	"▁when": 48,
	"▁than": 49,
	"▁these": 50,
	"▁some": 51,
	"▁time": 52,
	"▁into": 53,
	"▁just": 54,
	"▁its": 55,
	"▁do": 56,
	"▁out": 57,
	"▁them": 58,
	"▁up": 59,
	"▁may": 60,
	"▁what": 61,
	"▁been": 62,
	"▁like": 63,
	"▁other": 64,
	"▁so": 65,
	"▁how": 66,
	"▁who": 67,
	"▁two": 68,
	"▁my": 69,
	"▁use": 70,
	"▁get": 71,
	"▁she": 72,
	"▁also": 73,
	"▁because": 74,
	"▁then": 75,
	"▁now": 76,
	"▁first": 77,
	"▁only": 78,
	"▁make": 79,
	"▁know": 80,
	"▁people": 81,
	"▁said": 82,
	"▁where": 83,
	"▁very": 84,
	"▁over": 85,
	"▁such": 86,
	"▁see": 87,
	"▁him": 88,
	"▁way": 89,
	"▁many": 90,
	"▁most": 91,
	"▁could": 92,
	"▁should": 93,
	"▁after": 94,
	"▁well": 95,
	"▁your": 96,
	"▁through": 97,
	"▁back": 98,
	"▁any": 99,
	"▁our": 100
	},
	"merges": [
	"▁ t",
	"h e",
	"▁t he",
	"▁ a",
	"▁ to",
	"o f",
	"▁ of",
	"a n",
	"▁a nd",
	"i n",
	"▁ in",
	"i s",
	"▁ is",
	"▁ for",
	"t h",
	"▁th at",
	"o n",
	"▁ on",
	"w i",
	"▁wi th",
	"a s",
	"▁ as",
	"i t",
	"▁ it",
	"a t",
	"▁ at",
	"b y",
	"▁ by",
	"f r",
	"▁fr om",
	"o r",
	"▁ or",
	"▁a n",
	"▁th is",
	"b e",
	"▁ be",
	"a r",
	"▁ar e",
	"w a",
	"▁wa s",
	"n o",
	"▁no t",
	"y o",
	"▁yo u",
	"h a",
	"▁ha ve",
	"c a",
	"▁ca n",
	"w il",
	"▁wil l",
	"▁ we",
	"▁b ut",
	"al l",
	"▁ all",
	"th e",
	"▁the y",
	"▁ha s",
	"on e",
	"▁ one",
	"m or",
	"▁mor e",
	"w oul",
	"▁woul d",
	"▁ if",
	"th er",
	"▁ther e",
	"th ei",
	"▁thei r",
	"wh i",
	"▁whi ch",
	"ab ou",
	"▁abou t",
	"wh en",
	"▁ when",
	"th an",
	"▁ than",
	"th es",
	"▁thes e",
	"s om",
	"▁som e",
	"t im",
	"▁tim e",
	"in to",
	"▁ into",
	"j us",
	"▁jus t",
	"it s",
	"▁ its",
	"d o",
	"▁ do",
	"ou t",
	"▁ out",
	"th em",
	"▁ them",
	"u p",
	"▁ up"
	]
	}
	}