{
  "tokenizer_class": "ByteLevelTokenizer",
  "tokenizer_type": "byte_level_with_offset",
  "vocab_size": 32000,
  "special_tokens": {
    "pad_token": {"id": 0, "content": "[PAD]"},
    "eos_token": {"id": 1, "content": "[EOS]"},
    "bos_token": {"id": 2, "content": "[BOS]"}
  },
  "encoding_rule": "token_id = (byte_value % 256) + 3",
  "byte_range": "0-255 maps to token IDs 3-258",
  "max_length": 512,
  "note": "Simple byte-level tokenizer. Input text is converted to bytes, each byte value is offset by 3 to reserve IDs 0-2 for special tokens."
}