{
  "tokenizer_class": "ByteLevelTokenizer",
  "tokenizer_type": "byte_level_with_offset",
  "vocab_size": 32000,
  "special_tokens": {
    "pad_token": {"id": 0, "content": "[PAD]"},
    "eos_token": {"id": 1, "content": "[EOS]"},
    "bos_token": {"id": 2, "content": "[BOS]"}
  },
  "encoding_rule": "token_id = (byte_value % 256) + 3",
  "byte_range": "0-255 maps to token IDs 3-258",
  "max_length": 512,
  "note": "Simple byte-level tokenizer. Input text is converted to bytes, each byte value is offset by 3 to reserve IDs 0-2 for special tokens."
}