glaurung-binary-tokenizer-001 / tokenizer_config.json
mjbommar's picture
Add tokenizer config
9dda4b4 verified
{
"tokenizer_class": "PreTrainedTokenizerFast",
"vocab_size": 65536,
"model_type": "glaurung-binary-tokenizer",
"version": "001",
"encoding": "latin-1",
"description": "BPE tokenizer for binary executables and malware (x86-64, x86-32, ARM64, Windows PE, Linux ELF)",
"compression_ratio": 2.849,
"training_data": {
"size_gb": 13,
"files": 30738,
"platforms": [
"Linux (Alpine, Debian, Ubuntu)",
"Windows (8, 10, 11)"
],
"architectures": [
"x86-64",
"x86-32",
"ARM64"
]
},
"performance": {
"bytes_per_token": 2.849,
"theoretical_efficiency": 0.86,
"improvement_over_32k": 0.099
},
"predecessor": "mjbommar/binary-tokenizer-005"
}