mjbommar committed on
Commit
bbc3b0b
·
verified ·
1 Parent(s): ce335b9

Upload binary-tokenizer-001-4k tokenizer

Browse files
Files changed (3) hide show
  1. README.md +31 -3
  2. analysis_results.json +3 -131
  3. tokenizer.json +0 -0
README.md CHANGED
@@ -18,9 +18,19 @@ library_name: tokenizers
18
 
19
  # binary-tokenizer-001-4k
20
 
21
- **Model Name**: `binary-tokenizer-001-4k`
22
- **HuggingFace**: [`mjbommar/binary-tokenizer-001-4k`](https://huggingface.co/mjbommar/binary-tokenizer-001-4k)
23
- **Vocabulary Size**: 4,096 tokens (2^12)
 
 
 
 
 
 
 
 
 
 
24
 
25
  ---
26
 
@@ -184,6 +194,24 @@ Decoded: 7f454c4602010100000000000000000003003e000100000030...
184
 
185
  ---
186
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
187
  **Generated**: November 12, 2025
188
  **Training Script**: `train_tokenizers.sh`
189
  **Analysis Script**: `analyze_tokenizer.py`
 
18
 
19
  # binary-tokenizer-001-4k
20
 
21
+ A cross-platform BPE tokenizer for binary executables and machine code. Trained on 13 GB of diverse binaries spanning Linux, Windows, macOS, and Android platforms.
22
+
23
+ **🔗 Model**: [`mjbommar/binary-tokenizer-001-4k`](https://huggingface.co/mjbommar/binary-tokenizer-001-4k)
24
+ **📊 Dataset**: [`mjbommar/binary-30k-tokenized`](https://huggingface.co/datasets/mjbommar/binary-30k-tokenized)
25
+ **📄 Paper**: *Binary BPE: Cross-Platform Tokenization for Binary Analysis* (arXiv preprint coming soon)
26
+
27
+ ## Overview
28
+
29
+ - **Vocabulary Size**: 4,096 tokens (2^12)
30
+ - **Token Composition**: 256 base bytes + 3,833 learned merges + 7 special tokens
31
+ - **Average Token Length**: 3.000 bytes
32
+ - **3-byte Instructions**: 20.6% of the 4,089 non-special vocabulary entries (841 tokens)
33
+ - **Compression Ratio**: ~2.0 bytes/token on typical binaries
34
 
35
  ---
36
 
 
194
 
195
  ---
196
 
197
+ ## Citation
198
+
199
+ If you use this tokenizer in your research, please cite:
200
+
201
+ ```bibtex
202
+ @article{bommarito2025binarybpe,
203
+ title={Binary BPE: Cross-Platform Tokenization for Binary Analysis},
204
+ author={Bommarito II, Michael J.},
205
+ journal={arXiv preprint},
206
+ year={2025},
207
+ note={Preprint coming soon}
208
+ }
209
+ ```
210
+
211
+ **Author**: Michael J. Bommarito II ([michael.bommarito@gmail.com](mailto:michael.bommarito@gmail.com))
212
+
213
+ ---
214
+
215
  **Generated**: November 12, 2025
216
  **Training Script**: `train_tokenizers.sh`
217
  **Analysis Script**: `analyze_tokenizer.py`
analysis_results.json CHANGED
@@ -1,131 +1,3 @@
1
- {
2
- "vocab_size": {
3
- "total": 4089,
4
- "total_with_special": 4096,
5
- "base": 256,
6
- "merges": 3833,
7
- "special": 7,
8
- "is_power_of_2": true,
9
- "power": 12,
10
- "matches_expected": true
11
- },
12
- "reachability": {
13
- "valid_merges": 3833,
14
- "invalid_merges": 0,
15
- "reachable": 4089,
16
- "unreachable": 0,
17
- "all_reachable": true
18
- },
19
- "length_dist": {
20
- "distribution": {
21
- "1": 256,
22
- "2": 1974,
23
- "3": 841,
24
- "4": 649,
25
- "5": 95,
26
- "6": 86,
27
- "7": 40,
28
- "8": 59,
29
- "9": 19,
30
- "10": 11,
31
- "11": 7,
32
- "12": 15,
33
- "13": 3,
34
- "14": 7,
35
- "15": 5,
36
- "16": 11,
37
- "17": 2,
38
- "19": 1,
39
- "21": 1,
40
- "23": 1,
41
- "32": 5,
42
- "20": 1
43
- },
44
- "avg_length": 3.0004891171435557,
45
- "min_length": 1,
46
- "max_length": 32,
47
- "length_3_count": 841,
48
- "length_3_percent": 20.56737588652482
49
- },
50
- "byte_content": {
51
- "null_tokens": 1094,
52
- "ascii_printable": 896,
53
- "ascii_only": 1879,
54
- "high_byte": 2210,
55
- "mixed": 965,
56
- "byte_distribution": {
57
- "0": 2468,
58
- "255": 404,
59
- "72": 340,
60
- "1": 287,
61
- "32": 251,
62
- "3": 235,
63
- "139": 233,
64
- "204": 170,
65
- "36": 160,
66
- "64": 159,
67
- "2": 155,
68
- "116": 155,
69
- "65": 148,
70
- "249": 144,
71
- "128": 123,
72
- "4": 122,
73
- "101": 122,
74
- "137": 121,
75
- "15": 118,
76
- "145": 103,
77
- "97": 93,
78
- "8": 92,
79
- "68": 91,
80
- "131": 88,
81
- "232": 87,
82
- "114": 87,
83
- "16": 83,
84
- "170": 80,
85
- "110": 79,
86
- "111": 78,
87
- "105": 77,
88
- "84": 75,
89
- "115": 75,
90
- "169": 72,
91
- "192": 71,
92
- "99": 70,
93
- "117": 68,
94
- "141": 68,
95
- "6": 67,
96
- "76": 66,
97
- "69": 66,
98
- "108": 66,
99
- "31": 65,
100
- "5": 61,
101
- "33": 60,
102
- "112": 59,
103
- "100": 58,
104
- "48": 57,
105
- "224": 57,
106
- "95": 57
107
- }
108
- },
109
- "diversity": {
110
- "1": {
111
- "learned": 256,
112
- "possible": 256,
113
- "coverage": 100.0
114
- },
115
- "2": {
116
- "learned": 1974,
117
- "possible": 65536,
118
- "coverage": 3.0120849609375
119
- },
120
- "3": {
121
- "learned": 841,
122
- "possible": 16777216,
123
- "coverage": 0.0050127506256103516
124
- },
125
- "4": {
126
- "learned": 649,
127
- "possible": 4294967296,
128
- "coverage": 1.5110708773136139e-05
129
- }
130
- }
131
- }
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bcd88311fab5bda721656bc5c219845df171f81893d6cba28f1e5e77769af8b0
3
+ size 2331
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
tokenizer.json CHANGED
The diff for this file is too large to render. See raw diff