icxcn committed on
Commit
3cea44b
·
verified ·
1 Parent(s): 93145cf

Upload tokenizer_config.json with huggingface_hub

Browse files
Files changed (1) hide show
  1. tokenizer_config.json +14 -0
tokenizer_config.json ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "tokenizer_class": "ByteLevelTokenizer",
3
+ "tokenizer_type": "byte_level_with_offset",
4
+ "vocab_size": 32000,
5
+ "special_tokens": {
6
+ "pad_token": {"id": 0, "content": "[PAD]"},
7
+ "eos_token": {"id": 1, "content": "[EOS]"},
8
+ "bos_token": {"id": 2, "content": "[BOS]"}
9
+ },
10
+ "encoding_rule": "token_id = (byte_value % 256) + 3",
11
+ "byte_range": "0-255 maps to token IDs 3-258",
12
+ "max_length": 512,
13
+ "note": "Simple byte-level tokenizer. Input text is converted to bytes, and each byte value is offset by 3 to reserve IDs 0-2 for special tokens."
14
+ }