mineral-1b / tokenizer.json
prelington's picture
Upload folder using huggingface_hub
edd4ead verified
{
"version": "1.0",
"truncation": null,
"padding": null,
"added_tokens": [
{
"id": 0,
"content": "<pad>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 1,
"content": "<|start|>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 2,
"content": "<|end|>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 3,
"content": "<|return|>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 4,
"content": "<|call|>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 5,
"content": "<|message|>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 6,
"content": "<|channel|>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
}
],
"normalizer": {
"type": "Sequence",
"normalizers": [
{
"type": "Prepend",
"prepend": "▁"
},
{
"type": "Replace",
"pattern": {
"String": " "
},
"content": "▁"
}
]
},
"pre_tokenizer": {
"type": "Metaspace",
"replacement": "▁",
"add_prefix_space": true,
"split": true
},
"post_processor": {
"type": "TemplateProcessing",
"single": [
{
"SpecialToken": {
"id": "<|start|>",
"type_id": 0
}
},
{
"Sequence": {
"id": "A",
"type_id": 0
}
}
],
"pair": [
{
"SpecialToken": {
"id": "<|start|>",
"type_id": 0
}
},
{
"Sequence": {
"id": "A",
"type_id": 0
}
},
{
"Sequence": {
"id": "B",
"type_id": 1
}
}
],
"special_tokens": {
"<|start|>": {
"id": "<|start|>",
"ids": [1],
"tokens": ["<|start|>"]
}
}
},
"decoder": {
"type": "Metaspace",
"replacement": "▁",
"add_prefix_space": true,
"split": true
},
"model": {
"type": "BPE",
"dropout": null,
"unk_token": null,
"continuing_subword_prefix": "",
"end_of_word_suffix": "",
"fuse_unk": false,
"byte_fallback": true,
"vocab": {
"<pad>": 0,
"<|start|>": 1,
"<|end|>": 2,
"<|return|>": 3,
"<|call|>": 4,
"<|message|>": 5,
"<|channel|>": 6,
"▁": 7,
"▁the": 8,
"▁a": 9,
"▁to": 10,
"▁of": 11,
"▁and": 12,
"▁in": 13,
"▁is": 14,
"▁for": 15,
"▁that": 16,
"▁on": 17,
"▁with": 18,
"▁as": 19,
"▁it": 20,
"▁at": 21,
"▁by": 22,
"▁from": 23,
"▁or": 24,
"▁an": 25,
"▁this": 26,
"▁be": 27,
"▁are": 28,
"▁was": 29,
"▁not": 30,
"▁you": 31,
"▁have": 32,
"▁can": 33,
"▁will": 34,
"▁we": 35,
"▁but": 36,
"▁all": 37,
"▁they": 38,
"▁has": 39,
"▁one": 40,
"▁more": 41,
"▁would": 42,
"▁if": 43,
"▁there": 44,
"▁their": 45,
"▁which": 46,
"▁about": 47,
"▁when": 48,
"▁than": 49,
"▁these": 50,
"▁some": 51,
"▁time": 52,
"▁into": 53,
"▁just": 54,
"▁its": 55,
"▁do": 56,
"▁out": 57,
"▁them": 58,
"▁up": 59,
"▁may": 60,
"▁what": 61,
"▁been": 62,
"▁like": 63,
"▁other": 64,
"▁so": 65,
"▁how": 66,
"▁who": 67,
"▁two": 68,
"▁my": 69,
"▁use": 70,
"▁get": 71,
"▁she": 72,
"▁also": 73,
"▁because": 74,
"▁then": 75,
"▁now": 76,
"▁first": 77,
"▁only": 78,
"▁make": 79,
"▁know": 80,
"▁people": 81,
"▁said": 82,
"▁where": 83,
"▁very": 84,
"▁over": 85,
"▁such": 86,
"▁see": 87,
"▁him": 88,
"▁way": 89,
"▁many": 90,
"▁most": 91,
"▁could": 92,
"▁should": 93,
"▁after": 94,
"▁well": 95,
"▁your": 96,
"▁through": 97,
"▁back": 98,
"▁any": 99,
"▁our": 100
},
"merges": [
"▁ t",
"h e",
"▁t he",
"▁ a",
"▁ to",
"o f",
"▁ of",
"a n",
"▁a nd",
"i n",
"▁ in",
"i s",
"▁ is",
"▁ for",
"t h",
"▁th at",
"o n",
"▁ on",
"w i",
"▁wi th",
"a s",
"▁ as",
"i t",
"▁ it",
"a t",
"▁ at",
"b y",
"▁ by",
"f r",
"▁fr om",
"o r",
"▁ or",
"▁a n",
"▁th is",
"b e",
"▁ be",
"a r",
"▁ar e",
"w a",
"▁wa s",
"n o",
"▁no t",
"y o",
"▁yo u",
"h a",
"▁ha ve",
"c a",
"▁ca n",
"w il",
"▁wil l",
"▁ we",
"▁b ut",
"al l",
"▁ all",
"th e",
"▁the y",
"▁ha s",
"on e",
"▁ one",
"m or",
"▁mor e",
"w oul",
"▁woul d",
"▁ if",
"th er",
"▁ther e",
"th ei",
"▁thei r",
"wh i",
"▁whi ch",
"ab ou",
"▁abou t",
"wh en",
"▁ when",
"th an",
"▁ than",
"th es",
"▁thes e",
"s om",
"▁som e",
"t im",
"▁tim e",
"in to",
"▁ into",
"j us",
"▁jus t",
"it s",
"▁ its",
"d o",
"▁ do",
"ou t",
"▁ out",
"th em",
"▁ them",
"u p",
"▁ up"
]
}
}