tokenizer_config.json · dvitvaai/pothana-chat-300M at main

pothana-chat-300M / tokenizer_config.json

Upload folder using huggingface_hub

ebb32fd verified 2 months ago

1.29 kB

	{
	"tokenizer_class": "PreTrainedTokenizerFast",
	"auto_map": {
	"AutoTokenizer": [
	null,
	"tokenizer_class.TeluguTokenizer"
	]
	},
	"model_type": "llama",
	"bos_token": "<bos>",
	"eos_token": "<eos>",
	"unk_token": "<unk>",
	"pad_token": "<pad>",
	"add_bos_token": true,
	"add_eos_token": false,
	"clean_up_tokenization_spaces": false,
	"model_max_length": 2048,
	"extra_info": {
	"type": "morfessor_bpe_telugu",
	"separator": "@@",
	"note": "This tokenizer expects Morfessor-segmented text as input. For raw Telugu text, run Morfessor segmentation first using the included morfessor_telugu.bin model. Tokens ending with '@@' are continuation pieces that join to the next token. The decoder handles @@ removal automatically."
	},
	"additional_special_tokens": [
	"<\|system\|>",
	"<\|user\|>",
	"<\|assistant\|>",
	"<\|end\|>",
	"<bos>",
	"<eos>"
	],
	"chat_template": "{{ bos_token }}{% for message in messages %}{% if message['role'] == 'system' %}<\|system\|>{{ message['content'] }}<\|end\|>{% elif message['role'] == 'user' %}<\|user\|>{{ message['content'] }}<\|end\|>{% elif message['role'] == 'assistant' %}<\|assistant\|>{{ message['content'] }}<\|end\|>{% endif %}{% endfor %}{% if add_generation_prompt %}<\|assistant\|>{% endif %}"
	}