Update tokenizer.py
tokenizer.py  CHANGED  +26 -39
@@ -10,10 +10,9 @@ logger = logging.get_logger(__name__)
 def load_json(path: str) -> Union[Dict, List]:
     """
     Load a JSON file from the given path.
-
     Args:
         path (str): The path to the JSON file to be loaded.
-
+
     Returns:
         Union[Dict, List]: The parsed content of the JSON file, which could be a dictionary or a list.
     """
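The body of load_json falls outside the hunk; a minimal sketch consistent with the signature and docstring, assuming a UTF-8 JSON file on disk:

import json
from typing import Dict, List, Union

def load_json(path: str) -> Union[Dict, List]:
    # json.load returns a dict or a list depending on the top-level JSON value.
    with open(path, "r", encoding="utf-8") as f:
        return json.load(f)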
@@ -24,16 +23,14 @@ def load_json(path: str) -> Union[Dict, List]:
 class STLTokenizer(PreTrainedTokenizer):
     """
     A custom tokenizer class that extends `PreTrainedTokenizer` to handle a specific vocabulary and tokenization process.
-
-    This tokenizer can load a vocabulary from a JSON file, tokenize text, convert tokens to IDs,
+    This tokenizer can load a vocabulary from a JSON file, tokenize text, convert tokens to IDs,
     and handle padding and special tokens.
     """
 
-    def __init__(self, vocab_path: str = 'vocab.json', unk_token: str = "unk", pad_token: str = "pad",
-                 bos_token: str = "/s", eos_token: str = "s", model_max_length = 512):
+    def __init__(self, vocab_path: str = 'vocab.json', unk_token: str = "unk", pad_token: str = "pad",
+                 bos_token: str = "/s", eos_token: str = "s", model_max_length = 512, **kwargs):
         """
         Initializes the STLTokenizer with a given vocabulary and special tokens.
-
         Args:
             vocab_path (str): The path to the JSON file containing the vocabulary.
             unk_token (str, optional): The token used for unknown words. Defaults to "unk".
@@ -49,11 +46,19 @@ class STLTokenizer(PreTrainedTokenizer):
         self.model_max_length = model_max_length
         self.id_to_token = {v: k for k, v in self.vocab.items()}  # Reverse mapping
 
+        super().__init__(
+            unk_token=unk_token,
+            pad_token=pad_token,
+            bos_token=bos_token,
+            eos_token=eos_token,
+            model_max_length=model_max_length,
+            **kwargs
+        )
+
     @property
     def vocab_size(self) -> int:
         """
         Returns the size of the vocabulary.
-
         Returns:
             int: The number of tokens in the vocabulary.
         """
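The new super().__init__ call is the substantive change in this commit: the special tokens and any extra **kwargs are forwarded to PreTrainedTokenizer only after self.vocab and self.id_to_token exist, since the base constructor can touch vocab-dependent methods such as vocab_size. A minimal usage sketch, assuming tokenizer.py is importable and with invented vocab.json contents (the repository's real file will differ):

# vocab.json (illustrative contents only):
# {"pad": 0, "unk": 1, "/s": 2, "s": 3, "@": 4, "x": 5, ">": 6, "0": 7}
from tokenizer import STLTokenizer

tokenizer = STLTokenizer(vocab_path="vocab.json")
print(tokenizer.pad_token, tokenizer.unk_token)  # pad unk
print(tokenizer.vocab_size)                      # 8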
@@ -62,11 +67,9 @@ class STLTokenizer(PreTrainedTokenizer):
     def prepad_sequence(self, sequence, space_token = ' ', new_space_token = '@', undo = False):
         """
         Replaces spaces in the input sequence with a specified placeholder token.
-
         Args:
             sequence (str): The input sequence.
             undo (bool): If True, replaces the placeholder token with spaces again. Defaults to False, which replaces spaces with the placeholder.
-
         Returns:
             str: The preprocessed sequence with spaces or placeholder tokens replaced.
         """
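Only the docstring of prepad_sequence is visible in the diff; a minimal sketch of a body consistent with it, assuming a plain string substitution with the default '@' placeholder:

def prepad_sequence(self, sequence, space_token=' ', new_space_token='@', undo=False):
    # undo=False: hide spaces behind the placeholder so vocabulary lookup can
    # match them as ordinary symbols; undo=True: restore spaces after decoding.
    if undo:
        return sequence.replace(new_space_token, space_token)
    return sequence.replace(space_token, new_space_token)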
@@ -78,10 +81,8 @@ class STLTokenizer(PreTrainedTokenizer):
     def add_bos_eos(self, sequence: str) -> str:
         """
         Adds the BOS token at the start and the EOS token at the end of the sequence.
-
         Args:
             sequence (str): The input sequence.
-
         Returns:
             str: The sequence with the BOS and EOS tokens.
         """
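The body of add_bos_eos is likewise elided. Given the defaults bos_token="/s" and eos_token="s", one plausible sketch follows; whether the markers are joined with or without separating characters is an assumption:

def add_bos_eos(self, sequence: str) -> str:
    # Wrap the raw sequence in the tokenizer's BOS/EOS markers.
    return f"{self.bos_token}{sequence}{self.eos_token}"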
@@ -90,19 +91,16 @@ class STLTokenizer(PreTrainedTokenizer):
     def tokenize(self, text: str) -> List[str]:
         """
         Tokenizes the input text into a list of tokens.
-
-        The method preprocesses the input text by replacing spaces with placeholder tokens and then tries to
+        The method preprocesses the input text by replacing spaces with placeholder tokens and then tries to
         find the longest possible match for each substring in the vocabulary.
-
         Args:
             text (str): The input text to be tokenized.
-
         Returns:
             List[str]: A list of tokens representing the tokenized text.
         """
         text = self.add_bos_eos(text)
         text = self.prepad_sequence(text)
-
+
         tokens = []
         i = 0
         while i < len(text):
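The hunk stops at the top of the matching loop. The docstring describes greedy longest-match tokenization; a self-contained sketch of that algorithm (falling back to the unk token on a miss is an assumption):

from typing import Dict, List

def greedy_tokenize(text: str, vocab: Dict[str, int], unk_token: str = "unk") -> List[str]:
    tokens, i = [], 0
    while i < len(text):
        match = None
        # Try the longest candidate substring first, shrinking on misses.
        for j in range(len(text), i, -1):
            if text[i:j] in vocab:
                match = text[i:j]
                break
        if match is None:
            tokens.append(unk_token)  # no vocabulary entry covers text[i]
            i += 1
        else:
            tokens.append(match)
            i += len(match)
    return tokens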
@@ -123,10 +121,8 @@ class STLTokenizer(PreTrainedTokenizer):
     def convert_tokens_to_ids(self, tokens: List[str]) -> List[int]:
         """
         Converts a list of tokens into a list of token IDs.
-
         Args:
             tokens (List[str]): A list of tokens to be converted into IDs.
-
         Returns:
             List[int]: A list of corresponding token IDs.
         """
@@ -135,10 +131,8 @@ class STLTokenizer(PreTrainedTokenizer):
     def convert_ids_to_tokens(self, ids: List[int]) -> List[str]:
         """
         Converts a list of token IDs into a list of tokens.
-
         Args:
             ids (List[int]): A list of token IDs to be converted into tokens.
-
         Returns:
             List[str]: A list of corresponding tokens.
         """
@@ -147,14 +141,14 @@ class STLTokenizer(PreTrainedTokenizer):
     def encode(self, sequence: str) -> List[int]:
         """
         Encodes a string sequence into a list of token IDs.
-
-        This method tokenizes the input sequence using the `tokenize` method,
-        and then converts the resulting tokens into their corresponding token IDs
+
+        This method tokenizes the input sequence using the `tokenize` method,
+        and then converts the resulting tokens into their corresponding token IDs
         using the `convert_tokens_to_ids` method.
-
+
         Args:
             sequence (str): The input sequence (text) to be encoded.
-
+
         Returns:
             List[int]: A list of token IDs corresponding to the input sequence.
         """
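So encode is just the composition of the two previous methods; the input below is an invented STL-style formula:

sequence = "x > 0"  # illustrative input
ids = tokenizer.encode(sequence)
# Equivalent two-step form:
assert ids == tokenizer.convert_tokens_to_ids(tokenizer.tokenize(sequence))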
@@ -163,8 +157,8 @@ class STLTokenizer(PreTrainedTokenizer):
 
     def postpad_sequence(self, sequence, pad_token_id):
         """
-        Fills the sequence up to max_length padding elements
-        """
+        Fills the sequence with padding elements up to model_max_length.
+        """
         num_extra_elements = self.model_max_length - len(sequence) - 1
         if num_extra_elements > 0:
             sequence.extend([pad_token_id] * num_extra_elements)
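The `- 1` leaves one slot of headroom below model_max_length (plausibly for a final special token; the docstring does not say). A worked example with invented numbers:

# model_max_length = 8, pad_token_id = 0 (illustrative values)
seq = [2, 5, 6, 3]
num_extra_elements = 8 - len(seq) - 1  # 8 - 4 - 1 = 3
if num_extra_elements > 0:
    seq.extend([0] * num_extra_elements)
print(seq)  # [2, 5, 6, 3, 0, 0, 0]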
@@ -173,14 +167,11 @@ class STLTokenizer(PreTrainedTokenizer):
     def decode(self, token_ids: List[int]) -> str:
         """
         Decodes a list of token IDs into a string of text.
-
-        The method converts the IDs to tokens and joins them to form a string.
+        The method converts the IDs to tokens and joins them to form a string.
         It also restores the original spaces by undoing the prepad substitution.
-
         Args:
             token_ids (List[int]): A list of token IDs to be decoded.
             skip_special_tokens (bool, optional): Whether to skip special tokens during decoding. Defaults to False.
-
         Returns:
             str: The decoded string.
         """
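A round-trip sketch; note the signature takes only token_ids, so the skip_special_tokens flag mentioned in the docstring is not actually exposed by this method:

ids = tokenizer.encode("x > 0")
text = tokenizer.decode(ids)
# decode is expected to apply prepad_sequence(..., undo=True) so the '@'
# placeholders become spaces again; exact output depends on whether the
# BOS/EOS markers are stripped.
print(text)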
@@ -190,16 +181,13 @@ class STLTokenizer(PreTrainedTokenizer):
 
     def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
         """
-        Saves the tokenizer's vocabulary to a file.
-        Useful only when the vocabulary has to be retrieved and is not given
+        Saves the tokenizer's vocabulary to a file.
+        Useful only when the vocabulary has to be retrieved and is not given in advance
         (not the case here; kept as a hook for future improvements with sentencepiece).
-
-        This method saves the vocabulary to a JSON file in the specified directory.
-
+        This method saves the vocabulary to a JSON file in the specified directory.
         Args:
             save_directory (str): The directory where the vocabulary file will be saved.
             filename_prefix (Optional[str]): An optional prefix for the filename.
-
         Returns:
             Tuple[str]: A tuple containing the path to the saved vocabulary file.
         """
@@ -211,7 +199,6 @@ class STLTokenizer(PreTrainedTokenizer):
     def get_vocab(self) -> dict:
         """
         Retrieves the vocabulary used by the tokenizer.
-
         Returns:
             dict: The vocabulary as a dictionary.
         """