devoppro committed on
Commit
8c29328
·
verified ·
1 Parent(s): eec13c0

Create model/tokenizer.py

Browse files
Files changed (1) hide show
  1. model/tokenizer.py +24 -0
model/tokenizer.py ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from tokenizers import Tokenizer, models, trainers, pre_tokenizers, processors, decoders
2
+ from tokenizers.normalizers import NFC
3
+ from transformers import PreTrainedTokenizerFast
4
+ from pathlib import Path
5
+
6
# Special-token strings declared for this tokenizer module, in a fixed order.
# The order is part of the value: keep it stable if token ids are derived
# from list position anywhere else (not visible from this file — confirm).
SPECIAL_TOKENS = [
    # generic markers (pad / bos / eos / unk / sep)
    "<|pad|>",
    "<|bos|>",
    "<|eos|>",
    "<|unk|>",
    "<|sep|>",
    # fim_* markers — presumably fill-in-the-middle delimiters; confirm
    "<|fim_prefix|>",
    "<|fim_middle|>",
    "<|fim_suffix|>",
    # one tag per programming language
    "<|python|>",
    "<|javascript|>",
    "<|typescript|>",
    "<|cpp|>",
    "<|rust|>",
    "<|go|>",
    "<|java|>",
    "<|bash|>",
]
12
+
13
def get_gpt2_tokenizer_for_code():
    """Return the pretrained "gpt2" tokenizer extended with code-related tokens.

    The tokenizer is loaded with ``AutoTokenizer.from_pretrained("gpt2")``,
    its pad token is set to its EOS token, and the ``<|fim_*|>`` markers plus
    a set of language tags are registered as additional special tokens.

    Returns:
        The configured tokenizer object.
    """
    # Imported at call time, as in the original, so importing this module
    # does not itself require AutoTokenizer.
    from transformers import AutoTokenizer

    fim_markers = ["<|fim_prefix|>", "<|fim_middle|>", "<|fim_suffix|>"]
    # NOTE(review): this is a subset of the language tags in SPECIAL_TOKENS
    # (no typescript/cpp/java/bash) — confirm that is intentional.
    language_tags = ["<|python|>", "<|javascript|>", "<|rust|>", "<|go|>"]

    tokenizer = AutoTokenizer.from_pretrained("gpt2")
    # Reuse the EOS token for padding.
    tokenizer.pad_token = tokenizer.eos_token
    # NOTE(review): registering new tokens presumably grows the vocabulary,
    # so a model using this tokenizer may need its embeddings resized — confirm.
    tokenizer.add_special_tokens(
        {"additional_special_tokens": fim_markers + language_tags}
    )
    return tokenizer
22
+
23
def load_tokenizer(save_dir="./tokenizer"):
    """Load a tokenizer from ``save_dir``.

    Args:
        save_dir: Location handed unchanged to
            ``PreTrainedTokenizerFast.from_pretrained``; defaults to
            ``"./tokenizer"``.

    Returns:
        Whatever ``PreTrainedTokenizerFast.from_pretrained(save_dir)`` returns.
    """
    tokenizer = PreTrainedTokenizerFast.from_pretrained(save_dir)
    return tokenizer