aframson committed on
Commit
9d6689e
·
1 Parent(s): 4e47b59
Files changed (1) hide show
  1. tokenizeConfig.py +40 -22
tokenizeConfig.py CHANGED
@@ -3,26 +3,45 @@ from tokenizers import Tokenizer, models, pre_tokenizers, trainers, decoders
3
  import json
4
  from typing import List, Optional, Union, Dict
5
  from transformers.tokenization_utils_base import EncodedInput, BatchEncoding
6
- from transformers import PreTrainedTokenizer
7
- from transformers.utils import logging, PaddingStrategy
8
-
9
 
10
  class OBITokenizer(PreTrainedTokenizer):
11
- def __init__(self, auto_map={"AutoTokenizer": ["tokenizeConfig.OBITokenizer"]},tokenizer_class= "OBITokenizer",**kwargs):
12
- # Initialize your tokenizer with the auto_map parameter if needed
13
- self.auto_map=auto_map
14
- self.tokenizer_class=tokenizer_class
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
15
  # Initialize a BPE model for tokenization
16
  bpe_model = models.BPE()
17
- # Initialize the tokenizer
18
  self.tokenizer = Tokenizer(bpe_model)
 
19
  # Add pre-tokenization and decoding steps if needed
20
  self.tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel()
21
  self.tokenizer.decoder = decoders.ByteLevel()
22
- super().__init__(**kwargs)
23
 
24
  # Set the padding token
25
  self.pad_token = "[PAD]"
 
26
  # Set the special tokens
27
  self.cls_token = "[CLS]"
28
  self.sep_token = "[SEP]"
@@ -31,30 +50,29 @@ class OBITokenizer(PreTrainedTokenizer):
31
  self.bos_token = "[CLS]"
32
  self.eos_token = "[SEP]"
33
  self.pad_token = "[PAD]"
34
-
 
 
35
 
36
  def _pad(
37
- self,
38
- encoded_inputs: Union[Dict[str, EncodedInput], BatchEncoding],
39
- max_length: Optional[int] = None,
40
- padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
41
- pad_to_multiple_of: Optional[int] = None,
42
- return_attention_mask: Optional[bool] = None,
43
  ) -> dict:
44
  # Modify the _pad method as needed for OBITokenizer
45
  # You can inherit the implementation from ChatGLMTokenizer and customize it further
46
  return super()._pad(encoded_inputs, max_length, padding_strategy, pad_to_multiple_of, return_attention_mask)
47
-
48
 
49
- def train(self, files,save_path):
50
  # Training: Fit the tokenizer on your text data
51
  trainer = trainers.BpeTrainer(special_tokens=["[PAD]", "[CLS]", "[SEP]", "[MASK]", "[UNK]"])
52
  self.tokenizer.train(trainer=trainer, files=files)
53
  # Save the trained tokenizer to a file
54
  self.tokenizer.save(save_path)
55
 
56
-
57
-
58
  def save_config(self, config_file):
59
  # Serialize the tokenizer's config to a JSON file
60
  config_dict = {
@@ -62,7 +80,7 @@ class OBITokenizer(PreTrainedTokenizer):
62
  "vocab_size": self.tokenizer.get_vocab_size(),
63
  "tokenizer_class": "OBITokenizer",
64
  "auto_map": {
65
- "AutoTokenizer": ["tokenizeConfig.OBITokenizer","null"]
66
  },
67
  "bos_token": "[CLS]",
68
  "eos_token": "[SEP]",
@@ -81,4 +99,4 @@ class OBITokenizer(PreTrainedTokenizer):
81
 
82
  def decode(self, ids):
83
  # Decode IDs to text using the custom tokenizer
84
- return self.tokenizer.decode(ids)
 
3
  import json
4
  from typing import List, Optional, Union, Dict
5
  from transformers.tokenization_utils_base import EncodedInput, BatchEncoding
6
+ from transformers.utils import PaddingStrategy
 
 
7
 
8
  class OBITokenizer(PreTrainedTokenizer):
9
+ def __init__(
10
+ self,
11
+ vocab_file,
12
+ unk_token="<unk>",
13
+ bos_token="<s>",
14
+ eos_token="</s>",
15
+ pad_token=None,
16
+ add_bos_token=True,
17
+ add_eos_token=False,
18
+ clean_up_tokenization_spaces=False,
19
+ auto_map={"AutoTokenizer": ["tokenizeConfig.OBITokenizer"]},
20
+ tokenizer_class="OBITokenizer",
21
+ **kwargs,
22
+ ):
23
+ super().__init__(
24
+ unk_token=unk_token,
25
+ bos_token=bos_token,
26
+ eos_token=eos_token,
27
+ pad_token=pad_token,
28
+ add_bos_token=add_bos_token,
29
+ add_eos_token=add_eos_token,
30
+ clean_up_tokenization_spaces=clean_up_tokenization_spaces,
31
+ **kwargs,
32
+ )
33
+
34
  # Initialize a BPE model for tokenization
35
  bpe_model = models.BPE()
 
36
  self.tokenizer = Tokenizer(bpe_model)
37
+
38
  # Add pre-tokenization and decoding steps if needed
39
  self.tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel()
40
  self.tokenizer.decoder = decoders.ByteLevel()
 
41
 
42
  # Set the padding token
43
  self.pad_token = "[PAD]"
44
+
45
  # Set the special tokens
46
  self.cls_token = "[CLS]"
47
  self.sep_token = "[SEP]"
 
50
  self.bos_token = "[CLS]"
51
  self.eos_token = "[SEP]"
52
  self.pad_token = "[PAD]"
53
+
54
+ # Load the vocabulary file
55
+ self.tokenizer.get_vocab().add_special_tokens([self.cls_token, self.sep_token, self.unk_token, self.mask_token])
56
 
57
  def _pad(
58
+ self,
59
+ encoded_inputs: Union[Dict[str, EncodedInput], BatchEncoding],
60
+ max_length: Optional[int] = None,
61
+ padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
62
+ pad_to_multiple_of: Optional[int] = None,
63
+ return_attention_mask: Optional[bool] = None,
64
  ) -> dict:
65
  # Modify the _pad method as needed for OBITokenizer
66
  # You can inherit the implementation from ChatGLMTokenizer and customize it further
67
  return super()._pad(encoded_inputs, max_length, padding_strategy, pad_to_multiple_of, return_attention_mask)
 
68
 
69
+ def train(self, files, save_path):
70
  # Training: Fit the tokenizer on your text data
71
  trainer = trainers.BpeTrainer(special_tokens=["[PAD]", "[CLS]", "[SEP]", "[MASK]", "[UNK]"])
72
  self.tokenizer.train(trainer=trainer, files=files)
73
  # Save the trained tokenizer to a file
74
  self.tokenizer.save(save_path)
75
 
 
 
76
  def save_config(self, config_file):
77
  # Serialize the tokenizer's config to a JSON file
78
  config_dict = {
 
80
  "vocab_size": self.tokenizer.get_vocab_size(),
81
  "tokenizer_class": "OBITokenizer",
82
  "auto_map": {
83
+ "AutoTokenizer": ["tokenizeConfig.OBITokenizer", "null"]
84
  },
85
  "bos_token": "[CLS]",
86
  "eos_token": "[SEP]",
 
99
 
100
  def decode(self, ids):
101
  # Decode IDs to text using the custom tokenizer
102
+ return self.tokenizer.decode(ids)