anthonym21/json-tokenizer-structured

+"""
+json_tokenizer — A tokenizer optimized for JSON structures.
+Architecture:
+  - Structural tokens: single-token representations for JSON grammar ({, }, [, ], :, ,)
+  - Key tokens: deduplicated key vocabulary with Key() wrapper
+  - Value BPE: byte-pair encoding trained on JSON string/number values
+  - Type tokens: explicit type markers for faithful roundtrip encoding
+Delivers 5-15% fewer tokens than cl100k_base on schema-repetitive JSON
+with a 90x smaller vocabulary, and lossless roundtrip fidelity.
+"""
+from json_tokenizer.tokenizer import JSONTokenizer
+from json_tokenizer.bpe import BPETrainer
+__version__ = "0.2.0"
+__all__ = ["JSONTokenizer", "BPETrainer"]
+try:
+    from json_tokenizer.hf_compat import JSONPreTrainedTokenizer
+    __all__.append("JSONPreTrainedTokenizer")
+except ImportError:
+    pass  # transformers not installed

Upload json_tokenizer/__init__.py with huggingface_hub