{
"tokenizer_class": "PreTrainedTokenizerFast",
"auto_map": {
"AutoTokenizer": [
null,
"tokenizer_class.TeluguTokenizer"
]
},
"model_type": "llama",
"bos_token": "<bos>",
"eos_token": "<eos>",
"unk_token": "<unk>",
"pad_token": "<pad>",
"add_bos_token": true,
"add_eos_token": false,
"clean_up_tokenization_spaces": false,
"model_max_length": 2048,
"extra_info": {
"type": "morfessor_bpe_telugu",
"separator": "@@",
"note": "This tokenizer expects Morfessor-segmented text as input. For raw Telugu text, run Morfessor segmentation first using the included morfessor_telugu.bin model. Tokens ending with '@@' are continuation pieces that join to the next token. The decoder handles @@ removal automatically."
}
}