saracandu committed
Commit 564dda0 · verified · 1 parent: eb57d61

Update tokenizer.py

Files changed (1)
  1. tokenizer.py +26 -39
tokenizer.py CHANGED
@@ -10,10 +10,9 @@ logger = logging.get_logger(__name__)
 def load_json(path: str) -> Union[Dict, List]:
     """
     Load a JSON file from the given path.
-
     Args:
         path (str): The path to the JSON file to be loaded.
-
+
     Returns:
         Union[Dict, List]: The parsed content of the JSON file, which could be a dictionary or a list.
     """
@@ -24,16 +23,14 @@ def load_json(path: str) -> Union[Dict, List]:
 class STLTokenizer(PreTrainedTokenizer):
     """
     A custom tokenizer class that extends `PreTrainedTokenizer` to handle a specific vocabulary and tokenization process.
-
-    This tokenizer can load a vocabulary from a JSON file, tokenize text, convert tokens to IDs,
+    This tokenizer can load a vocabulary from a JSON file, tokenize text, convert tokens to IDs,
     and handle padding and special tokens.
     """
 
-    def __init__(self, vocab_path: str = 'vocab.json', unk_token: str = "unk", pad_token: str = "pad",
-                 bos_token: str = "/s", eos_token: str = "s", model_max_length = 512):
+    def __init__(self, vocab_path: str = 'vocab.json', unk_token: str = "unk", pad_token: str = "pad",
+                 bos_token: str = "/s", eos_token: str = "s", model_max_length = 512, **kwargs):
        """
        Initializes the STLTokenizer with a given vocabulary and special tokens.
-
        Args:
            vocab_path (str): The path to the JSON file containing the vocabulary.
            unk_token (str, optional): The token used for unknown words. Defaults to "unk".
@@ -49,11 +46,19 @@ class STLTokenizer(PreTrainedTokenizer):
         self.model_max_length = model_max_length
         self.id_to_token = {v: k for k, v in self.vocab.items()}  # Reverse mapping
 
+        super().__init__(
+            unk_token=unk_token,
+            pad_token=pad_token,
+            bos_token=bos_token,
+            eos_token=eos_token,
+            model_max_length=model_max_length,
+            **kwargs
+        )
+
     @property
     def vocab_size(self) -> int:
         """
         Returns the size of the vocabulary.
-
         Returns:
             int: The number of tokens in the vocabulary.
         """
@@ -62,11 +67,9 @@ class STLTokenizer(PreTrainedTokenizer):
     def prepad_sequence(self, sequence, space_token = ' ', new_space_token = '@', undo = False):
         """
         Replaces spaces in the input sequence with a specified token.
-
         Args:
             sequence (str): The input sequence.
             undo (bool): If True, replace the padding token with spaces. Defaults to False, which pads the spaces.
-
         Returns:
             str: The preprocessed sequence with spaces or padding tokens replaced.
         """
@@ -78,10 +81,8 @@ class STLTokenizer(PreTrainedTokenizer):
     def add_bos_eos(self, sequence: str) -> str:
         """
         Adds the BOS token at the start and the EOS token at the end of the sequence.
-
         Args:
             sequence (str): The input sequence.
-
         Returns:
             str: The sequence with the BOS and EOS tokens.
         """
@@ -90,19 +91,16 @@ class STLTokenizer(PreTrainedTokenizer):
     def tokenize(self, text: str) -> List[str]:
         """
         Tokenizes the input text into a list of tokens.
-
-        The method preprocesses the input text by replacing spaces with padding tokens and then tries to
+        The method preprocesses the input text by replacing spaces with padding tokens and then tries to
         find the longest possible match for each substring in the vocabulary.
-
         Args:
             text (str): The input text to be tokenized.
-
         Returns:
             List[str]: A list of tokens representing the tokenized text.
         """
         text = self.add_bos_eos(text)
         text = self.prepad_sequence(text)
-
+
         tokens = []
         i = 0
         while i < len(text):
@@ -123,10 +121,8 @@ class STLTokenizer(PreTrainedTokenizer):
     def convert_tokens_to_ids(self, tokens: List[str]) -> List[int]:
         """
         Converts a list of tokens into a list of token IDs.
-
         Args:
             tokens (List[str]): A list of tokens to be converted into IDs.
-
         Returns:
             List[int]: A list of corresponding token IDs.
         """
@@ -135,10 +131,8 @@ class STLTokenizer(PreTrainedTokenizer):
     def convert_ids_to_tokens(self, ids: List[int]) -> List[str]:
         """
         Converts a list of token IDs into a list of tokens.
-
         Args:
             ids (List[int]): A list of token IDs to be converted into tokens.
-
         Returns:
             List[str]: A list of corresponding tokens.
         """
@@ -147,14 +141,14 @@ class STLTokenizer(PreTrainedTokenizer):
     def encode(self, sequence: str) -> List[int]:
         """
         Encodes a string sequence into a list of token IDs.
-
-        This method tokenizes the input sequence using the `tokenize` method,
-        and then converts the resulting tokens into their corresponding token IDs
+
+        This method tokenizes the input sequence using the `tokenize` method,
+        and then converts the resulting tokens into their corresponding token IDs
         using the `convert_tokens_to_ids` method.
-
+
         Args:
             sequence (str): The input sequence (text) to be encoded.
-
+
         Returns:
             List[int]: A list of token IDs corresponding to the input sequence.
         """
@@ -163,8 +157,8 @@ class STLTokenizer(PreTrainedTokenizer):
 
     def postpad_sequence(self, sequence, pad_token_id):
         """
-        Fills the sequence up to max_length with padding elements
-        """
+        Fills the sequence up to max_length with padding elements
+        """
         num_extra_elements = self.model_max_length - len(sequence) - 1
         if num_extra_elements > 0:
             sequence.extend([pad_token_id] * num_extra_elements)
@@ -173,14 +167,11 @@ class STLTokenizer(PreTrainedTokenizer):
     def decode(self, token_ids: List[int]) -> str:
         """
         Decodes a list of token IDs into a string of text.
-
-        The method converts the IDs to tokens and joins them to form a string.
+        The method converts the IDs to tokens and joins them to form a string.
         It also restores the original spaces or padding tokens if `undo` is True.
-
         Args:
             token_ids (List[int]): A list of token IDs to be decoded.
             skip_special_tokens (bool, optional): Whether to skip special tokens during decoding. Defaults to False.
-
         Returns:
             str: The decoded string.
         """
@@ -190,16 +181,13 @@ class STLTokenizer(PreTrainedTokenizer):
 
     def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
         """
-        Saves the tokenizer's vocabulary to a file.
-        Useful only when the vocabulary has to be retrieved and is not given
+        Saves the tokenizer's vocabulary to a file.
+        Useful only when the vocabulary has to be retrieved and is not given
         (thus this is not the case: here to further improvements with sentencepiece).
-
-        This method saves the vocabulary to a JSON file in the specified directory.
-
+        This method saves the vocabulary to a JSON file in the specified directory.
         Args:
             save_directory (str): The directory where the vocabulary file will be saved.
             filename_prefix (Optional[str]): An optional prefix for the filename.
-
         Returns:
             Tuple[str]: A tuple containing the path to the saved vocabulary file.
         """
@@ -211,7 +199,6 @@ class STLTokenizer(PreTrainedTokenizer):
     def get_vocab(self) -> dict:
         """
         Retrieves the vocabulary used by the tokenizer.
-
         Returns:
             dict: The vocabulary as a dictionary.
         """