Upload tokenizer
- special_tokens_map.json +6 -0
- tokenizer.py +218 -0
- tokenizer_config.json +50 -0
- vocab.json +37 -0
special_tokens_map.json
ADDED
@@ -0,0 +1,6 @@
{
  "bos_token": "/s",
  "eos_token": "s",
  "pad_token": "pad",
  "unk_token": "unk"
}
tokenizer.py
ADDED
@@ -0,0 +1,218 @@
import json
from pathlib import Path
from typing import Dict, List, Optional, Tuple, Union

from huggingface_hub import hf_hub_download
from transformers import PreTrainedTokenizer
from transformers.utils import logging

logger = logging.get_logger(__name__)


def load_json(path, repo_id=None):
    """Load a JSON file, downloading it from the Hub first when repo_id is given."""
    if repo_id:
        path = hf_hub_download(repo_id, path)
    with open(path, "r", encoding="utf-8") as f:
        return json.load(f)


def load_json_old(path: str) -> Union[Dict, List]:
    """
    Load a JSON file from the given path.

    Args:
        path (str): The path to the JSON file to be loaded.

    Returns:
        Union[Dict, List]: The parsed content of the JSON file, which may be a dictionary or a list.
    """
    full_path = Path(__file__).parent / path
    with open(full_path, "r", encoding="utf-8") as f:
        return json.load(f)


class STLTokenizer(PreTrainedTokenizer):
    """
    A custom tokenizer class that extends `PreTrainedTokenizer` to handle a specific
    vocabulary and tokenization process. This tokenizer can load a vocabulary from a
    JSON file, tokenize text, convert tokens to IDs, and handle padding and special tokens.
    """

    def __init__(self, vocab_path: str = "vocab.json", unk_token: str = "unk", pad_token: str = "pad",
                 bos_token: str = "/s", eos_token: str = "s", model_max_length: int = 512, **kwargs):
        """
        Initializes the STLTokenizer with a given vocabulary and special tokens.

        Args:
            vocab_path (str): The path to the JSON file containing the vocabulary.
            unk_token (str, optional): The token used for unknown words. Defaults to "unk".
            pad_token (str, optional): The token used for padding. Defaults to "pad".
            bos_token (str, optional): The token marking the beginning of a sequence. Defaults to "/s".
            eos_token (str, optional): The token marking the end of a sequence. Defaults to "s".
            model_max_length (int, optional): The maximum sequence length. Defaults to 512.
        """
        # The vocabulary is fetched from the Hub repo rather than from a local path.
        self.vocab = load_json(vocab_path, repo_id="saracandu/stldec_random_32")
        self.unk_token = unk_token
        self.pad_token = pad_token
        self.bos_token = bos_token
        self.eos_token = eos_token
        self.model_max_length = model_max_length
        self.id_to_token = {v: k for k, v in self.vocab.items()}  # reverse mapping

        super().__init__(
            unk_token=unk_token,
            pad_token=pad_token,
            bos_token=bos_token,
            eos_token=eos_token,
            model_max_length=model_max_length,
            **kwargs
        )

    @property
    def vocab_size(self) -> int:
        """
        Returns the size of the vocabulary.

        Returns:
            int: The number of tokens in the vocabulary.
        """
        return len(self.vocab)

    def prepad_sequence(self, sequence, space_token=' ', new_space_token='@', undo=False):
        """
        Replaces spaces in the input sequence with a dedicated space token.

        Args:
            sequence (str): The input sequence.
            space_token (str): The character treated as a space. Defaults to ' '.
            new_space_token (str): The token standing in for a space. Defaults to '@'.
            undo (bool): If True, replaces the space token with actual spaces instead.
                Defaults to False, which replaces the spaces.

        Returns:
            str: The sequence with spaces (or space tokens) replaced.
        """
        if undo:
            return sequence.replace(new_space_token, space_token)
        else:
            return sequence.replace(space_token, new_space_token)

    def add_bos_eos(self, sequence: str) -> str:
        """
        Adds the BOS token at the beginning and the EOS token at the end of the sequence.

        Args:
            sequence (str): The input sequence.

        Returns:
            str: The sequence wrapped with the BOS and EOS tokens.
        """
        return f'{self.bos_token} {sequence} {self.eos_token}'

    def tokenize(self, text: str) -> List[str]:
        """
        Tokenizes the input text into a list of tokens.

        The method wraps the input with BOS/EOS, replaces spaces with the space
        token, and then finds the longest possible vocabulary match at each
        position in the string.

        Args:
            text (str): The input text to be tokenized.

        Returns:
            List[str]: A list of tokens representing the tokenized text.
        """
        text = self.add_bos_eos(text)
        text = self.prepad_sequence(text)

        tokens = []
        i = 0
        while i < len(text):
            best_match = None
            for j in range(len(text), i, -1):  # try matching substrings of decreasing length
                subtoken = text[i:j]
                if subtoken in self.vocab:
                    best_match = subtoken
                    break
            if best_match:
                tokens.append(best_match)
                i += len(best_match)
            else:
                tokens.append(self.unk_token)
                i += 1
        return tokens

    def convert_tokens_to_ids(self, tokens: List[str]) -> List[int]:
        """
        Converts a list of tokens into a list of token IDs.

        Args:
            tokens (List[str]): A list of tokens to be converted into IDs.

        Returns:
            List[int]: A list of corresponding token IDs.
        """
        unk_token_id = self.vocab.get(str(self.unk_token))
        return [self.vocab.get(token, unk_token_id) for token in tokens]

    def convert_ids_to_tokens(self, ids: List[int]) -> List[str]:
        """
        Converts a list of token IDs into a list of tokens.

        Args:
            ids (List[int]): A list of token IDs to be converted into tokens.

        Returns:
            List[str]: A list of corresponding tokens.
        """
        return [self.id_to_token.get(i, self.unk_token) for i in ids]

    def encode(self, sequence: str) -> List[int]:
        """
        Encodes a string sequence into a list of token IDs.

        This method tokenizes the input sequence using the `tokenize` method,
        then converts the resulting tokens into their corresponding token IDs
        using the `convert_tokens_to_ids` method.

        Args:
            sequence (str): The input sequence (text) to be encoded.

        Returns:
            List[int]: A list of token IDs corresponding to the input sequence.
        """
        tokens = self.tokenize(sequence)
        return self.convert_tokens_to_ids(tokens)

    def postpad_sequence(self, sequence, pad_token_id):
        """
        Pads the sequence with `pad_token_id` up to `model_max_length - 1` elements.
        """
        num_extra_elements = self.model_max_length - len(sequence) - 1
        if num_extra_elements > 0:
            sequence.extend([pad_token_id] * num_extra_elements)
        return sequence

    def decode(self, token_ids: List[int]) -> str:
        """
        Decodes a list of token IDs into a string of text.

        The method converts the IDs to tokens, joins them into a string, and
        restores the original spaces by undoing the space-token substitution.

        Args:
            token_ids (List[int]): A list of token IDs to be decoded.

        Returns:
            str: The decoded string.
        """
        tokens = self.convert_ids_to_tokens(token_ids)
        decoded = "".join(tokens)
        return self.prepad_sequence(decoded, undo=True)

    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
        """
        Saves the tokenizer's vocabulary to a JSON file in the specified directory.

        Only needed when the vocabulary has to be rebuilt rather than shipped with
        the repo (kept here as a hook for future improvements, e.g. with sentencepiece).

        Args:
            save_directory (str): The directory where the vocabulary file will be saved.
            filename_prefix (Optional[str]): An optional prefix for the filename.

        Returns:
            Tuple[str]: A tuple containing the path to the saved vocabulary file.
        """
        vocab_file = f"{save_directory}/{filename_prefix + '-' if filename_prefix else ''}vocab.json"
        with open(vocab_file, "w", encoding="utf-8") as f:
            json.dump(self.vocab, f, indent=2, ensure_ascii=False)
        return (vocab_file,)

    def get_vocab(self) -> dict:
        """
        Retrieves the vocabulary used by the tokenizer.

        Returns:
            dict: The vocabulary as a dictionary.
        """
        return self.vocab
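Aside (not part of the upload): a minimal, self-contained sketch of the greedy longest-match loop that STLTokenizer.tokenize implements, run against a hand-picked slice of the vocab.json added below. It shows how the multi-character operator ">=" wins over ">" and how spaces are first rewritten to the "@" token:

# Standalone sketch, assumed equivalent to STLTokenizer.tokenize minus the
# BOS/EOS wrapping; `vocab` is a slice of the vocab.json in this commit.
vocab = {"always": 6, "(": 4, ")": 5, "x_": 17, ">=": 12, ">": 14,
         "0": 24, "5": 29, "1": 25, ".": 23, "@": 34}

def greedy_tokenize(text, vocab, unk_token="unk"):
    text = text.replace(" ", "@")          # prepad_sequence: spaces -> "@"
    tokens, i = [], 0
    while i < len(text):
        for j in range(len(text), i, -1):  # longest candidate first
            if text[i:j] in vocab:
                tokens.append(text[i:j])
                i = j
                break
        else:                              # no match at i: emit unk, advance one char
            tokens.append(unk_token)
            i += 1
    return tokens

print(greedy_tokenize("always ( x_1 >= 0.5 )", vocab))
# ['always', '@', '(', '@', 'x_', '1', '@', '>=', '@', '0', '.', '5', '@', ')']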
tokenizer_config.json
ADDED
@@ -0,0 +1,50 @@
{
  "added_tokens_decoder": {
    "0": {
      "content": "unk",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "1": {
      "content": "pad",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "2": {
      "content": "/s",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "3": {
      "content": "s",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    }
  },
  "auto_map": {
    "AutoTokenizer": [
      "tokenizer.STLTokenizer",
      null
    ]
  },
  "bos_token": "/s",
  "clean_up_tokenization_spaces": false,
  "eos_token": "s",
  "extra_special_tokens": {},
  "model_max_length": 512,
  "pad_token": "pad",
  "tokenizer_class": "STLTokenizer",
  "unk_token": "unk"
}
vocab.json
ADDED
@@ -0,0 +1,37 @@
{
  "unk": 0,
  "pad": 1,
  "/s": 2,
  "s": 3,
  "(": 4,
  ")": 5,
  "always": 6,
  "eventually": 7,
  "until": 8,
  "and": 9,
  "or": 10,
  "not": 11,
  ">=": 12,
  "<=": 13,
  ">": 14,
  "<": 15,
  "=": 16,
  "x_": 17,
  "[": 18,
  "]": 19,
  ",": 20,
  "inf": 21,
  "-": 22,
  ".": 23,
  "0": 24,
  "1": 25,
  "2": 26,
  "3": 27,
  "4": 28,
  "5": 29,
  "6": 30,
  "7": 31,
  "8": 32,
  "9": 33,
  "@": 34
}
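Aside (not part of the upload): since tokenizer_config.json maps AutoTokenizer to tokenizer.STLTokenizer, the files above should be loadable through transformers' custom-code path. A hedged usage sketch, assuming the files live in the "saracandu/stldec_random_32" repo hard-coded in tokenizer.py:

from transformers import AutoTokenizer

# trust_remote_code=True is required because STLTokenizer is defined in the
# repo's tokenizer.py rather than in the transformers library itself.
tokenizer = AutoTokenizer.from_pretrained("saracandu/stldec_random_32",
                                          trust_remote_code=True)

ids = tokenizer.encode("eventually [ 0 , inf ] ( x_1 <= 2 )")
print(ids)                    # IDs for the sequence wrapped in "/s" ... "s"
print(tokenizer.decode(ids))  # decode undoes the "@" space substitution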