File size: 709 Bytes
9dda4b4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
{
  "tokenizer_class": "PreTrainedTokenizerFast",
  "vocab_size": 65536,
  "model_type": "glaurung-binary-tokenizer",
  "version": "001",
  "encoding": "latin-1",
  "description": "BPE tokenizer for binary executables and malware (x86-64, x86-32, ARM64, Windows PE, Linux ELF)",
  "compression_ratio": 2.849,
  "training_data": {
    "size_gb": 13,
    "files": 30738,
    "platforms": [
      "Linux (Alpine, Debian, Ubuntu)",
      "Windows (8, 10, 11)"
    ],
    "architectures": [
      "x86-64",
      "x86-32",
      "ARM64"
    ]
  },
  "performance": {
    "bytes_per_token": 2.849,
    "theoretical_efficiency": 0.86,
    "improvement_over_32k": 0.099
  },
  "predecessor": "mjbommar/binary-tokenizer-005"
}