{
  "tokenizer_class": "ByteLevelTokenizer",
  "tokenizer_type": "byte_level_with_offset",
  "vocab_size": 32000,
  "special_tokens": {
    "pad_token": {"id": 0, "content": "[PAD]"},
    "eos_token": {"id": 1, "content": "[EOS]"},
    "bos_token": {"id": 2, "content": "[BOS]"}
  },
  "encoding_rule": "token_id = (byte_value % 256) + 3",
  "byte_range": "0-255 maps to token IDs 3-258",
  "max_length": 512,
  "note": "Simple byte-level tokenizer. Input text is converted to bytes, each byte value is offset by 3 to reserve IDs 0-2 for special tokens."
}