{
  "tokenizer_class": "PreTrainedTokenizerFast",
  "vocab_size": 65536,
  "model_type": "glaurung-binary-tokenizer",
  "version": "001",
  "encoding": "latin-1",
  "description": "BPE tokenizer for binary executables and malware (x86-64, ARM64, Windows PE, Linux ELF)",
  "compression_ratio": 2.849,
  "training_data": {
    "size_gb": 13,
    "files": 30738,
    "platforms": [
      "Linux (Alpine, Debian, Ubuntu)",
      "Windows (8, 10, 11)"
    ],
    "architectures": [
      "x86-64",
      "x86-32",
      "ARM64"
    ]
  },
  "performance": {
    "bytes_per_token": 2.849,
    "theoretical_efficiency": 0.86,
    "improvement_over_32k": 0.099
  },
  "predecessor": "mjbommar/binary-tokenizer-005"
}