mjbommar
/

binary-tokenizer-001-4k

@@ -18,9 +18,19 @@ library_name: tokenizers
 # binary-tokenizer-001-4k
-**Model Name**: `binary-tokenizer-001-4k`
-**HuggingFace**: [`mjbommar/binary-tokenizer-001-4k`](https://huggingface.co/mjbommar/binary-tokenizer-001-4k)
-**Vocabulary Size**: 4,096 tokens (2^12)
 ---
@@ -184,6 +194,24 @@ Decoded: 7f454c4602010100000000000000000003003e000100000030...
 ---
 **Generated**: November 12, 2025
 **Training Script**: `train_tokenizers.sh`
 **Analysis Script**: `analyze_tokenizer.py`

 # binary-tokenizer-001-4k
+A cross-platform BPE tokenizer for binary executables and machine code. Trained on 13 GB of diverse binaries spanning Linux, Windows, macOS, and Android platforms.
+**🔗 Model**: [`mjbommar/binary-tokenizer-001-4k`](https://huggingface.co/mjbommar/binary-tokenizer-001-4k)
+**📊 Dataset**: [`mjbommar/binary-30k-tokenized`](https://huggingface.co/datasets/mjbommar/binary-30k-tokenized)
+**📄 Paper**: *Binary BPE: Cross-Platform Tokenization for Binary Analysis* (arXiv preprint coming soon)
+## Overview
+- **Vocabulary Size**: 4,096 tokens (2^12)
+- **Token Composition**: 256 base bytes + 3,833 learned merges + 7 special tokens
+- **Average Token Length**: 3.000 bytes (mean length of the 4,089 non-special vocabulary entries, not of tokenized output)
+- **3-byte Tokens**: 20.6% of the 4,089 non-special vocabulary entries (841 tokens)
+- **Compression Ratio**: ~2.0 bytes/token on typical binaries
 ---
 ---
+## Citation
+If you use this tokenizer in your research, please cite:
+```bibtex
+@article{bommarito2025binarybpe,
+  title={Binary BPE: Cross-Platform Tokenization for Binary Analysis},
+  author={Bommarito II, Michael J.},
+  journal={arXiv preprint},
+  year={2025},
+  note={Preprint coming soon}
+}
+```
+**Author**: Michael J. Bommarito II ([michael.bommarito@gmail.com](mailto:michael.bommarito@gmail.com))
+---
 **Generated**: November 12, 2025
 **Training Script**: `train_tokenizers.sh`
 **Analysis Script**: `analyze_tokenizer.py`

analysis_results.json CHANGED Viewed

@@ -1,131 +1,3 @@
-{
-  "vocab_size": {
-    "total": 4089,
-    "total_with_special": 4096,
-    "base": 256,
-    "merges": 3833,
-    "special": 7,
-    "is_power_of_2": true,
-    "power": 12,
-    "matches_expected": true
-  },
-  "reachability": {
-    "valid_merges": 3833,
-    "invalid_merges": 0,
-    "reachable": 4089,
-    "unreachable": 0,
-    "all_reachable": true
-  },
-  "length_dist": {
-    "distribution": {
-      "1": 256,
-      "2": 1974,
-      "3": 841,
-      "4": 649,
-      "5": 95,
-      "6": 86,
-      "7": 40,
-      "8": 59,
-      "9": 19,
-      "10": 11,
-      "11": 7,
-      "12": 15,
-      "13": 3,
-      "14": 7,
-      "15": 5,
-      "16": 11,
-      "17": 2,
-      "19": 1,
-      "21": 1,
-      "23": 1,
-      "32": 5,
-      "20": 1
-    },
-    "avg_length": 3.0004891171435557,
-    "min_length": 1,
-    "max_length": 32,
-    "length_3_count": 841,
-    "length_3_percent": 20.56737588652482
-  },
-  "byte_content": {
-    "null_tokens": 1094,
-    "ascii_printable": 896,
-    "ascii_only": 1879,
-    "high_byte": 2210,
-    "mixed": 965,
-    "byte_distribution": {
-      "0": 2468,
-      "255": 404,
-      "72": 340,
-      "1": 287,
-      "32": 251,
-      "3": 235,
-      "139": 233,
-      "204": 170,
-      "36": 160,
-      "64": 159,
-      "2": 155,
-      "116": 155,
-      "65": 148,
-      "249": 144,
-      "128": 123,
-      "4": 122,
-      "101": 122,
-      "137": 121,
-      "15": 118,
-      "145": 103,
-      "97": 93,
-      "8": 92,
-      "68": 91,
-      "131": 88,
-      "232": 87,
-      "114": 87,
-      "16": 83,
-      "170": 80,
-      "110": 79,
-      "111": 78,
-      "105": 77,
-      "84": 75,
-      "115": 75,
-      "169": 72,
-      "192": 71,
-      "99": 70,
-      "117": 68,
-      "141": 68,
-      "6": 67,
-      "76": 66,
-      "69": 66,
-      "108": 66,
-      "31": 65,
-      "5": 61,
-      "33": 60,
-      "112": 59,
-      "100": 58,
-      "48": 57,
-      "224": 57,
-      "95": 57
-    }
-  },
-  "diversity": {
-    "1": {
-      "learned": 256,
-      "possible": 256,
-      "coverage": 100.0
-    },
-    "2": {
-      "learned": 1974,
-      "possible": 65536,
-      "coverage": 3.0120849609375
-    },
-    "3": {
-      "learned": 841,
-      "possible": 16777216,
-      "coverage": 0.0050127506256103516
-    },
-    "4": {
-      "learned": 649,
-      "possible": 4294967296,
-      "coverage": 1.5110708773136139e-05
-    }
-  }
-}

+version https://git-lfs.github.com/spec/v1
+oid sha256:bcd88311fab5bda721656bc5c219845df171f81893d6cba28f1e5e77769af8b0
+size 2331

tokenizer.json CHANGED Viewed

The diff for this file is too large to render. See raw diff