sabiyarn-32k / tokenizer_config.json
BeardedMonster's picture
Upload tokenizer
0558a07 verified
{
"add_prefix_space": false,
"added_tokens_decoder": {
"0": {
"content": "<unk>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"1": {
"content": "<s>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"2": {
"content": "</s>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"3": {
"content": "<pad>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"4": {
"content": "<ner>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"5": {
"content": "<response>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"6": {
"content": "<lang_ID>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"7": {
"content": "<correct>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"8": {
"content": "<iga>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"9": {
"content": "<efi>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"10": {
"content": "<eng>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"11": {
"content": "<egb>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"12": {
"content": "<prompt>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"13": {
"content": "<pcm>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"14": {
"content": "<title>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"15": {
"content": "<fuv>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"16": {
"content": "<ben>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"17": {
"content": "<tag>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"18": {
"content": "<classify>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"19": {
"content": "<ibo>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"20": {
"content": "<ff>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"21": {
"content": "<ijo>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"22": {
"content": "<tiv>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"23": {
"content": "<ibi>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"24": {
"content": "<diacritize>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"25": {
"content": "<summary>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"26": {
"content": "<headline>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"27": {
"content": "<topic>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"28": {
"content": "<lang_ID_label>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"29": {
"content": "<hau>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"30": {
"content": "<answer>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"31": {
"content": "<iso>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"32": {
"content": "|end_of_text|",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"33": {
"content": "<translate>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"34": {
"content": "<ila>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"35": {
"content": "<qa>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"36": {
"content": "<urh>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"37": {
"content": "<lang_id_label>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"38": {
"content": "<context>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"39": {
"content": "<clean>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"40": {
"content": "<sentiment>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"41": {
"content": "<NER>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"42": {
"content": "<summarize>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"43": {
"content": "<yor>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"44": {
"content": "<question>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"52000": {
"content": "<ful>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"52001": {
"content": "<|system|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"52002": {
"content": "<twi>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"52003": {
"content": "<fon>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"52004": {
"content": "<fat>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"52005": {
"content": "<ewe>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"52006": {
"content": "<ton>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"52007": {
"content": "<aka>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"52008": {
"content": "<toxic>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"52009": {
"content": "<STR>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"52010": {
"content": "<text>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"52011": {
"content": "<intent>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"52012": {
"content": "<think>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"52013": {
"content": "</think>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"52014": {
"content": "<reason>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"52015": {
"content": "<score>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"52016": {
"content": "<|user|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"52017": {
"content": "<identify>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"52018": {
"content": "<lang_id>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"52019": {
"content": "<|assistant|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"52020": {
"content": "<tool_call>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"52021": {
"content": "<|edit|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"52022": {
"content": "<|data_extract|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"52023": {
"content": "<|generate|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"52024": {
"content": "<|math|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"52025": {
"content": "<|JSON|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"52026": {
"content": "<|code|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"52027": {
"content": "<|is_technical|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"52028": {
"content": "<|input_lang|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"52029": {
"content": "<|target_lang|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"52030": {
"content": "</tool_call>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"52031": {
"content": "<|is_legal|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"52032": {
"content": "<|is_medical|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"52033": {
"content": "<|is_scientific|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"52034": {
"content": "<|is_non_technical|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"52035": {
"content": "<|is_financial|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"52036": {
"content": "<|RAG|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"52037": {
"content": "<tool_response>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"52038": {
"content": "</tool_response>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"52039": {
"content": "</context>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"52040": {
"content": "<task_plan>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"52041": {
"content": "</task_plan>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"52042": {
"content": "<|plan|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"52043": {
"content": "|analyze|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"52044": {
"content": "<|recommend|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"52045": {
"content": "<|explain|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"52046": {
"content": "<|debug|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"52047": {
"content": "<|chat|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"52048": {
"content": "<nli>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
}
},
"additional_special_tokens": [
"<|system|>",
"<twi>",
"<fon>",
"<fat>",
"<ewe>",
"<ton>",
"<aka>",
"<toxic>",
"<STR>",
"<text>",
"<intent>",
"<think>",
"</think>",
"<reason>",
"<score>",
"<|user|>",
"<identify>",
"<lang_id>",
"<lang_id_label>",
"<ner>",
"<NER>",
"<summary>",
"<summarize>",
"<|assistant|>",
"<context>",
"<clean>",
"<diacritize>",
"<lang_ID>",
"<lang_ID_label>",
"<title>",
"<headline>",
"<qa>",
"<tool_call>",
"<|edit|>",
"<|data_extract|>",
"<|generate|>",
"<|math|>",
"<|JSON|>",
"<|code|>",
"<|is_technical|>",
"<|input_lang|>",
"<|target_lang|>",
"</tool_call>",
"<|is_legal|>",
"<|is_medical|>",
"<|is_scientific|>",
"<|is_non_technical|>",
"<|is_financial|>",
"<|RAG|>",
"<tool_response>",
"</tool_response>",
"</context>",
"<task_plan>",
"</task_plan>",
"<|plan|>",
"|analyze|>",
"<|recommend|>",
"<|explain|>",
"<|debug|>",
"<|code|>",
"<|chat|>",
"<nli>"
],
"bos_token": "<s>",
"chat_template": "{%- for message in messages %}\n {%- if loop.index0 == 0 %}\n {{- bos_token }}\n {%- endif %}\n\n {# ===== SYSTEM MESSAGES ===== #}\n {%- if message['role'] == 'system' %}\n {{- \"<|system|>\" }}\n {{- message['content'] }}\n {{- eos_token }}\n\n {# ===== USER MESSAGES ===== #}\n {%- elif message['role'] == 'user' %}\n {{- \"<|user|>\" }}\n {{- message['content'] }}\n {{- eos_token }}\n\n {# ===== TOOL RESPONSES ===== #}\n {%- elif message['role'] == 'tool' %}\n {{- \"<tool_result>\" }}\n {{- message['name'] ~ \" \" if message['name'] else \"\" }}\n {{- message['content'] }}\n {{- \"</tool_result>\" }}\n {{- eos_token }}\n\n {# ===== ASSISTANT MESSAGES — NORMAL OR WITH TOOL CALL ===== #}\n {%- elif message['role'] == 'assistant' %}\n {{- \"<|assistant|>\" }}\n\n {# CASE 1 — assistant text only #}\n {%- if 'tool_calls' not in message %}\n {{- message['content'] }}\n {{- eos_token }}\n\n {# CASE 2 — assistant performs tool call #}\n {%- else %}\n {%- for tool_call in message['tool_calls'] %}\n {{- \"<tool_call>\" }}\n {{- tool_call['function']['name'] }}\n {{- \"\\n\" }}\n {{- tool_call['function']['arguments'] | tojson }}\n {{- \"</tool_call>\" }}\n {{- eos_token }}\n {%- endfor %}\n {%- endif %}\n\n {%- endif %}\n{%- endfor %}\n\n{%- if add_generation_prompt %}\n {{- \"<|assistant|>\" }}\n{%- endif %}",
"clean_up_tokenization_spaces": false,
"eos_token": "</s>",
"model_max_length": 1000000000000000019884624838656,
"pad_token": "<pad>",
"padding_side": "left",
"tokenizer_class": "BloomTokenizer",
"unk_token": "<unk>"
}