mjbommar committed on
Commit
9dda4b4
·
verified ·
1 Parent(s): e7ddf05

Add tokenizer config

Browse files
Files changed (1) hide show
  1. tokenizer_config.json +28 -0
tokenizer_config.json ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "tokenizer_class": "PreTrainedTokenizerFast",
3
+ "vocab_size": 65536,
4
+ "model_type": "glaurung-binary-tokenizer",
5
+ "version": "001",
6
+ "encoding": "latin-1",
7
+ "description": "BPE tokenizer for binary executables and malware (x86-64, ARM64, Windows PE, Linux ELF)",
8
+ "compression_ratio": 2.849,
9
+ "training_data": {
10
+ "size_gb": 13,
11
+ "files": 30738,
12
+ "platforms": [
13
+ "Linux (Alpine, Debian, Ubuntu)",
14
+ "Windows (8, 10, 11)"
15
+ ],
16
+ "architectures": [
17
+ "x86-64",
18
+ "x86-32",
19
+ "ARM64"
20
+ ]
21
+ },
22
+ "performance": {
23
+ "bytes_per_token": 2.849,
24
+ "theoretical_efficiency": 0.86,
25
+ "improvement_over_32k": 0.099
26
+ },
27
+ "predecessor": "mjbommar/binary-tokenizer-005"
28
+ }