hey-shiv committed on
Commit
e6bb431
·
verified ·
1 Parent(s): 5b02b35

Upload train_tokenizer.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. train_tokenizer.py +47 -0
train_tokenizer.py ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pathlib import Path
2
+
3
+ from tokenizers import Tokenizer
4
+ from tokenizers.decoders import ByteLevel as ByteLevelDecoder
5
+ from tokenizers.models import BPE
6
+ from tokenizers.pre_tokenizers import ByteLevel
7
+ from tokenizers.processors import ByteLevel as ByteLevelProcessor
8
+ from tokenizers.trainers import BpeTrainer
9
+
10
+ from download_dataset import DATA_PATH, ensure_dataset_exists
11
+
12
+
# Where the trained tokenizer is written: tokenizer.json next to this script.
TOKENIZER_PATH = Path(__file__).resolve().with_name("tokenizer.json")
# Target vocabulary size passed to the BPE trainer.
VOCAB_SIZE = 2000
# Special tokens registered with the trainer (placed at the start of the vocab).
SPECIAL_TOKENS = [
    "[PAD]",
    "[UNK]",
    "[BOS]",
    "[EOS]",
]
21
+
22
+
def main() -> None:
    """Train a byte-level BPE tokenizer on the dataset and save it to disk.

    Ensures the dataset file exists (downloading it if necessary), trains a
    BPE model with the module-level vocabulary size and special tokens, then
    writes the result to ``TOKENIZER_PATH`` and reports the final vocab size.
    """
    corpus_path = ensure_dataset_exists(DATA_PATH)

    # Byte-level pipeline: pre-tokenizer, decoder, and post-processor all
    # operate on raw bytes so any input text round-trips losslessly.
    bpe_tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
    bpe_tokenizer.pre_tokenizer = ByteLevel(add_prefix_space=False)
    bpe_tokenizer.decoder = ByteLevelDecoder()
    bpe_tokenizer.post_processor = ByteLevelProcessor(trim_offsets=True)

    # Seed the alphabet with every byte so no input character is unknown.
    trainer_config = {
        "vocab_size": VOCAB_SIZE,
        "min_frequency": 2,
        "special_tokens": SPECIAL_TOKENS,
        "initial_alphabet": ByteLevel.alphabet(),
        "show_progress": True,
    }
    bpe_tokenizer.train([str(corpus_path)], BpeTrainer(**trainer_config))
    bpe_tokenizer.save(str(TOKENIZER_PATH))

    print(f"Saved tokenizer to {TOKENIZER_PATH}")
    print(f"Final vocab size: {bpe_tokenizer.get_vocab_size()}")
# Script entry point: train and save the tokenizer only when run directly,
# not when this module is imported.
if __name__ == "__main__":
    main()