import json
import os
import re
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple, Union

from huggingface_hub import hf_hub_download
from transformers import PreTrainedTokenizer
from transformers.utils import logging

logger = logging.get_logger(__name__)

def load_json(path: str, repo_id: Optional[str] = None) -> Union[Dict, List]:
    """
    Load a JSON file, optionally resolving it through the Hugging Face Hub.

    Args:
        path (str): The local path, or the filename inside the Hub repository.
        repo_id (Optional[str]): If given, download `path` from this Hub repository.

    Returns:
        Union[Dict, List]: The parsed JSON content.
    """
    if repo_id:
        # Fetch the file through the Hub cache instead of the local filesystem.
        path = hf_hub_download(repo_id, path)
    with open(path, "r", encoding="utf-8") as f:
        return json.load(f)
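
# Illustrative sketch, not from the original source: with a repo_id the file is
# resolved through the Hugging Face Hub cache, otherwise it is read locally.
#
#   vocab = load_json("vocab.json", repo_id="saracandu/stldec_random_32")
#   local = load_json("vocab.json")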

def load_json_old(path: str) -> Union[Dict, List]:
    """
    Load a JSON file from a path resolved relative to this module.

    Args:
        path (str): The path to the JSON file to be loaded.

    Returns:
        Union[Dict, List]: The parsed content of the JSON file, which could be a dictionary or a list.
    """
    full_path = Path(__file__).parent / path
    with open(full_path, "r", encoding="utf-8") as f:
        return json.load(f)

class STLTokenizer(PreTrainedTokenizer):
    """
    A custom tokenizer class that extends `PreTrainedTokenizer` to handle a specific vocabulary and tokenization process.
    This tokenizer can load a vocabulary from a JSON file, tokenize text, convert tokens to IDs,
    and handle padding and special tokens.
    """

    def __init__(self, vocab_path: str = "vocab.json", unk_token: str = "unk", pad_token: str = "pad",
                 bos_token: str = "/s", eos_token: str = "s", model_max_length: int = 512, **kwargs):
        """
        Initializes the STLTokenizer with a given vocabulary and special tokens.

        Args:
            vocab_path (str): The path to the JSON file containing the vocabulary.
            unk_token (str, optional): The token used for unknown words. Defaults to "unk".
            pad_token (str, optional): The token used for padding. Defaults to "pad".
            bos_token (str, optional): The token used for the beginning of a sequence. Defaults to "/s".
            eos_token (str, optional): The token used for the end of a sequence. Defaults to "s".
            model_max_length (int, optional): The maximum sequence length. Defaults to 512.
        """
        # Load the vocabulary named by `vocab_path` (rather than a hard-coded
        # filename) from the Hub repository hosting this tokenizer.
        self.vocab = load_json(vocab_path, repo_id="saracandu/stldec_random_32")
        self.unk_token = unk_token
        self.pad_token = pad_token
        self.bos_token = bos_token
        self.eos_token = eos_token
        self.model_max_length = model_max_length
        # Inverse mapping used for decoding: token ID -> token string.
        self.id_to_token = {v: k for k, v in self.vocab.items()}

        super().__init__(
            unk_token=unk_token,
            pad_token=pad_token,
            bos_token=bos_token,
            eos_token=eos_token,
            model_max_length=model_max_length,
            **kwargs
        )

    @property
    def vocab_size(self) -> int:
        """
        Returns the size of the vocabulary.

        Returns:
            int: The number of tokens in the vocabulary.
        """
        return len(self.vocab)

    def prepad_sequence(self, sequence: str, space_token: str = ' ', new_space_token: str = '@', undo: bool = False) -> str:
        """
        Replaces spaces in the input sequence with a dedicated space token.

        Args:
            sequence (str): The input sequence.
            space_token (str, optional): The character treated as a space. Defaults to ' '.
            new_space_token (str, optional): The token substituted for each space. Defaults to '@'.
            undo (bool): If True, replace the space tokens back with spaces. Defaults to False, which replaces the spaces.

        Returns:
            str: The sequence with spaces or space tokens replaced.
        """
        if undo:
            return sequence.replace(new_space_token, space_token)
        else:
            return sequence.replace(space_token, new_space_token)

    def add_bos_eos(self, sequence: str) -> str:
        """
        Adds the BOS token at the beginning and the EOS token at the end of the sequence.

        Args:
            sequence (str): The input sequence.

        Returns:
            str: The sequence wrapped in BOS and EOS tokens.
        """
        return f'{self.bos_token} {sequence} {self.eos_token}'

    def tokenize(self, text: str) -> List[str]:
        """
        Tokenizes the input text into a list of tokens.

        The method preprocesses the input text by adding BOS/EOS tokens and
        replacing spaces with the space token, then greedily matches the
        longest possible vocabulary entry at each position.

        Args:
            text (str): The input text to be tokenized.

        Returns:
            List[str]: A list of tokens representing the tokenized text.
        """
        text = self.add_bos_eos(text)
        text = self.prepad_sequence(text)

        tokens = []
        i = 0
        while i < len(text):
            best_match = None
            # Try the longest remaining substring first and shrink it until a
            # vocabulary entry matches.
            for j in range(len(text), i, -1):
                subtoken = text[i:j]
                if subtoken in self.vocab:
                    best_match = subtoken
                    break
            if best_match:
                tokens.append(best_match)
                i += len(best_match)
            else:
                # No vocabulary entry covers this character: emit the unknown
                # token and advance by one.
                tokens.append(self.unk_token)
                i += 1
        return tokens
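
    # Illustrative sketch, not from the original source: assuming a hypothetical
    # vocabulary {"/s": 0, "s": 1, "@": 2, "al": 3, "w": 4, "a": 5, "y": 6},
    # tokenize("always") first becomes "/s@always@s" (BOS/EOS added, spaces
    # replaced by "@") and is then greedily split into
    # ["/s", "@", "al", "w", "a", "y", "s", "@", "s"], always preferring the
    # longest vocabulary entry at each position.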

    def convert_tokens_to_ids(self, tokens: List[str]) -> List[int]:
        """
        Converts a list of tokens into a list of token IDs.

        Args:
            tokens (List[str]): A list of tokens to be converted into IDs.

        Returns:
            List[int]: A list of corresponding token IDs.
        """
        unk_token_str = str(self.unk_token)
        unk_token_id = self.vocab.get(unk_token_str)
        return [self.vocab.get(token, unk_token_id) for token in tokens]

    def convert_ids_to_tokens(self, ids: List[int]) -> List[str]:
        """
        Converts a list of token IDs into a list of tokens.

        Args:
            ids (List[int]): A list of token IDs to be converted into tokens.

        Returns:
            List[str]: A list of corresponding tokens.
        """
        return [self.id_to_token.get(i, self.unk_token) for i in ids]

    def encode(self, sequence: str) -> List[int]:
        """
        Encodes a string sequence into a list of token IDs.

        This method tokenizes the input sequence using the `tokenize` method,
        and then converts the resulting tokens into their corresponding token IDs
        using the `convert_tokens_to_ids` method.

        Args:
            sequence (str): The input sequence (text) to be encoded.

        Returns:
            List[int]: A list of token IDs corresponding to the input sequence.
        """
        splitted_sequence = self.tokenize(sequence)
        return self.convert_tokens_to_ids(splitted_sequence)

    def postpad_sequence(self, sequence: List[int], pad_token_id: int) -> List[int]:
        """
        Pads the sequence with `pad_token_id` up to `model_max_length` - 1 elements.
        """
        # Note the -1: the padded sequence stops one element short of
        # model_max_length.
        num_extra_elements = self.model_max_length - len(sequence) - 1
        if num_extra_elements > 0:
            sequence.extend([pad_token_id] * num_extra_elements)
        return sequence
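
    # Illustrative sketch, not from the original source: with
    # model_max_length = 8 and pad_token_id = 0, postpad_sequence([5, 6, 7], 0)
    # appends four zeros and returns [5, 6, 7, 0, 0, 0, 0], one element short
    # of model_max_length because of the -1 above.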

    def decode(self, token_ids: List[int]) -> str:
        """
        Decodes a list of token IDs into a string of text.

        The method converts the IDs to tokens, joins them into a single string,
        and restores the original spaces by undoing the space-token substitution.

        Args:
            token_ids (List[int]): A list of token IDs to be decoded.

        Returns:
            str: The decoded string.
        """
        tokens = self.convert_ids_to_tokens(token_ids)
        decoded = "".join(tokens)
        return self.prepad_sequence(decoded, undo=True)

    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
        """
        Saves the tokenizer's vocabulary to a file.

        Only needed when the vocabulary has to be rebuilt rather than given in
        advance (not the case here; kept as a hook for future improvements,
        e.g. with SentencePiece).
        This method saves the vocabulary to a JSON file in the specified directory.

        Args:
            save_directory (str): The directory where the vocabulary file will be saved.
            filename_prefix (Optional[str]): An optional prefix for the filename.

        Returns:
            Tuple[str]: A tuple containing the path to the saved vocabulary file.
        """
        vocab_file = os.path.join(save_directory, f"{filename_prefix + '-' if filename_prefix else ''}vocab.json")
        with open(vocab_file, "w", encoding="utf-8") as f:
            json.dump(self.vocab, f, indent=2, ensure_ascii=False)
        return (vocab_file,)

    def get_vocab(self) -> dict:
        """
        Retrieves the vocabulary used by the tokenizer.

        Returns:
            dict: The vocabulary as a dictionary.
        """
        return self.vocab
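
# A minimal usage sketch, assuming the Hub-hosted vocabulary above is
# reachable; the example formula string is illustrative only.
if __name__ == "__main__":
    tokenizer = STLTokenizer()
    ids = tokenizer.encode("x_1 <= 0.5")
    print(ids)
    print(tokenizer.decode(ids))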