anthonym21 committed on
Commit
d54e053
·
verified ·
1 Parent(s): 8ae05a9

Upload json_tokenizer/__init__.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. json_tokenizer/__init__.py +24 -0
json_tokenizer/__init__.py ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""
json_tokenizer — A tokenizer optimized for JSON structures.

Architecture:
  - Structural tokens: single-token representations for JSON grammar ({, }, [, ], :, ,)
  - Key tokens: deduplicated key vocabulary with Key() wrapper
  - Value BPE: byte-pair encoding trained on JSON string/number values
  - Type tokens: explicit type markers for faithful roundtrip encoding

Delivers 5-15% fewer tokens than cl100k_base on schema-repetitive JSON
with a 90x smaller vocabulary, and lossless roundtrip fidelity.

Public API:
  - JSONTokenizer (re-exported from json_tokenizer.tokenizer)
  - BPETrainer (re-exported from json_tokenizer.bpe)
  - JSONPreTrainedTokenizer (re-exported from json_tokenizer.hf_compat);
    only exported when that module can be imported.
"""

from json_tokenizer.tokenizer import JSONTokenizer
from json_tokenizer.bpe import BPETrainer

__all__ = ["JSONTokenizer", "BPETrainer"]
__version__ = "0.2.0"

# The Hugging Face compatibility wrapper is optional: expose it only when its
# module imports cleanly, and leave the core API usable otherwise.
try:
    from json_tokenizer.hf_compat import JSONPreTrainedTokenizer
except ImportError:
    # transformers not installed -- skip the optional export.
    pass
else:
    __all__ += ["JSONPreTrainedTokenizer"]