{
  "tokenizer_class": "PreTrainedTokenizerFast",
  "auto_map": {
    "AutoTokenizer": [
      null,
      "tokenizer_class.TeluguTokenizer"
    ]
  },
  "model_type": "llama",
  "bos_token": "<bos>",
  "eos_token": "<eos>",
  "unk_token": "<unk>",
  "pad_token": "<pad>",
  "add_bos_token": true,
  "add_eos_token": false,
  "clean_up_tokenization_spaces": false,
  "model_max_length": 2048,
  "extra_info": {
    "type": "morfessor_bpe_telugu",
    "separator": "@@",
    "note": "This tokenizer expects Morfessor-segmented text as input. For raw Telugu text, run Morfessor segmentation first using the included morfessor_telugu.bin model. Tokens ending with '@@' are continuation pieces that join to the next token. The decoder handles @@ removal automatically."
  }
}