saracandu committed
Commit 5bcd847 · verified · 1 Parent(s): 2a59121

Delete handcoded_tokenizer.py

Files changed (1)
  1. handcoded_tokenizer.py +0 -232
handcoded_tokenizer.py DELETED
@@ -1,232 +0,0 @@
- import json
- from typing import Dict, List, Optional, Tuple, Union
- from transformers import PreTrainedTokenizer
- from transformers.utils import logging
-
- logger = logging.get_logger(__name__)
-
- def load_json(path: str) -> Union[Dict, List]:
-     """
-     Load a JSON file from the given path.
-
-     Args:
-         path (str): The path to the JSON file to be loaded.
-
-     Returns:
-         Union[Dict, List]: The parsed content of the JSON file, which may be a dictionary or a list.
-     """
-     with open(path, "r") as f:
-         return json.load(f)
-
-
- class STLTokenizer(PreTrainedTokenizer):
-     """
-     A custom tokenizer class that extends `PreTrainedTokenizer` to handle a specific vocabulary and tokenization process.
-
-     This tokenizer can load a vocabulary from a JSON file, tokenize text, convert tokens to IDs,
-     and handle padding and special tokens.
-     """
-
-     def __init__(self, vocab_path: str, unk_token: str = "unk", pad_token: str = "pad",
-                  bos_token: str = "/s", eos_token: str = "s", model_max_length: int = 512):
-         """
-         Initializes the STLTokenizer with a given vocabulary and special tokens.
-
-         Args:
-             vocab_path (str): The path to the JSON file containing the vocabulary.
-             unk_token (str, optional): The token used for unknown words. Defaults to "unk".
-             pad_token (str, optional): The token used for padding. Defaults to "pad".
-             bos_token (str, optional): The token used for the beginning of a sequence. Defaults to "/s".
-             eos_token (str, optional): The token used for the end of a sequence. Defaults to "s".
-             model_max_length (int, optional): The maximum sequence length. Defaults to 512.
-         """
-         self.vocab = load_json(vocab_path)
-         self.id_to_token = {v: k for k, v in self.vocab.items()}  # Reverse mapping
-         # The vocabulary must exist before the base class initializes, since
-         # PreTrainedTokenizer registers the special tokens against it.
-         super().__init__(unk_token=unk_token, pad_token=pad_token, bos_token=bos_token,
-                          eos_token=eos_token, model_max_length=model_max_length)
-
-     @property
-     def vocab_size(self) -> int:
-         """
-         Returns the size of the vocabulary.
-
-         Returns:
-             int: The number of tokens in the vocabulary.
-         """
-         return len(self.vocab)
-
-     def prepad_sequence(self, sequence: str, space_token: str = ' ', new_space_token: str = '@', undo: bool = False) -> str:
-         """
-         Replaces spaces in the input sequence with a placeholder token (or vice versa).
-
-         Args:
-             sequence (str): The input sequence.
-             space_token (str, optional): The character treated as a space. Defaults to ' '.
-             new_space_token (str, optional): The placeholder that stands in for spaces. Defaults to '@'.
-             undo (bool): If True, replace the placeholder token with spaces; otherwise replace spaces with the placeholder. Defaults to False.
-
-         Returns:
-             str: The sequence with spaces or placeholder tokens replaced.
-         """
-         if undo:
-             return sequence.replace(new_space_token, space_token)
-         else:
-             return sequence.replace(space_token, new_space_token)
-
-     def add_bos_eos(self, sequence: str) -> str:
-         """
-         Adds the BOS token at the beginning and the EOS token at the end of the sequence.
-
-         Args:
-             sequence (str): The input sequence.
-
-         Returns:
-             str: The sequence with the BOS and EOS tokens added.
-         """
-         return f'{self.bos_token} {sequence} {self.eos_token}'
-
-     def tokenize(self, text: str) -> List[str]:
-         """
-         Tokenizes the input text into a list of tokens.
-
-         The method preprocesses the input text by replacing spaces with the placeholder token and then
-         greedily finds the longest possible vocabulary match at each position.
-
-         Args:
-             text (str): The input text to be tokenized.
-
-         Returns:
-             List[str]: A list of tokens representing the tokenized text.
-         """
-         text = self.add_bos_eos(text)
-         text = self.prepad_sequence(text)
-
-         tokens = []
-         i = 0
-         while i < len(text):
-             best_match = None
-             for j in range(len(text), i, -1):  # Try matching substrings of decreasing length
-                 subtoken = text[i:j]
-                 if subtoken in self.vocab:
-                     best_match = subtoken
-                     break
-             if best_match:
-                 tokens.append(best_match)
-                 i += len(best_match)
-             else:
-                 tokens.append(self.unk_token)
-                 i += 1
-         return tokens
-
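-     # Illustrative trace (editor's sketch; the toy vocabulary is hypothetical,
-     # not the real tokenizer.json): with vocab = {"/s": 0, "s": 1, "@": 2,
-     # "a": 3, "ab": 4}, tokenize("ab a") first becomes "/s@ab@a@s" via
-     # add_bos_eos and prepad_sequence, and the greedy loop then yields
-     # ['/s', '@', 'ab', '@', 'a', '@', 's'] -- "ab" wins over "a" because
-     # longer substrings are tried first.
-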
-     def convert_tokens_to_ids(self, tokens: List[str]) -> List[int]:
-         """
-         Converts a list of tokens into a list of token IDs.
-
-         Args:
-             tokens (List[str]): A list of tokens to be converted into IDs.
-
-         Returns:
-             List[int]: A list of corresponding token IDs.
-         """
-         return [self.vocab.get(token, self.vocab[self.unk_token]) for token in tokens]
-
-     def convert_ids_to_tokens(self, ids: List[int]) -> List[str]:
-         """
-         Converts a list of token IDs into a list of tokens.
-
-         Args:
-             ids (List[int]): A list of token IDs to be converted into tokens.
-
-         Returns:
-             List[str]: A list of corresponding tokens.
-         """
-         return [self.id_to_token.get(i, self.unk_token) for i in ids]
-
-     def encode(self, sequence: str) -> List[int]:
-         """
-         Encodes a string sequence into a list of token IDs.
-
-         This method tokenizes the input sequence using the `tokenize` method,
-         and then converts the resulting tokens into their corresponding token IDs
-         using the `convert_tokens_to_ids` method.
-
-         Args:
-             sequence (str): The input sequence (text) to be encoded.
-
-         Returns:
-             List[int]: A list of token IDs corresponding to the input sequence.
-         """
-         tokens = self.tokenize(sequence)
-         return self.convert_tokens_to_ids(tokens)
-
-     def postpad_sequence(self, sequence: List[int], pad_token_id: int) -> List[int]:
-         """
-         Pads the sequence with `pad_token_id` until it reaches model_max_length - 1 elements.
-         """
-         num_extra_elements = self.model_max_length - len(sequence) - 1
-         if num_extra_elements > 0:
-             sequence.extend([pad_token_id] * num_extra_elements)
-         return sequence
-
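-     # Editor's sketch (illustrative values): with model_max_length = 8,
-     # postpad_sequence([5, 6, 7], pad_token_id=0) extends the list by
-     # 8 - 3 - 1 = 4 pads, returning [5, 6, 7, 0, 0, 0, 0], i.e. length
-     # model_max_length - 1.
-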
-     def decode(self, token_ids: List[int]) -> str:
-         """
-         Decodes a list of token IDs into a string of text.
-
-         The method converts the IDs to tokens, joins them to form a string,
-         and restores the original spaces by undoing the placeholder substitution.
-
-         Args:
-             token_ids (List[int]): A list of token IDs to be decoded.
-
-         Returns:
-             str: The decoded string.
-         """
-         tokens = self.convert_ids_to_tokens(token_ids)
-         decoded = "".join(tokens)
-         return self.prepad_sequence(decoded, undo=True)
-
-     def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
-         """
-         Saves the tokenizer's vocabulary to a file.
-         Mainly useful when the vocabulary has to be retrieved later rather than supplied up front
-         (not the case here; kept as a hook for future improvements, e.g. with sentencepiece).
-
-         This method saves the vocabulary to a JSON file in the specified directory.
-
-         Args:
-             save_directory (str): The directory where the vocabulary file will be saved.
-             filename_prefix (Optional[str]): An optional prefix for the filename.
-
-         Returns:
-             Tuple[str]: A tuple containing the path to the saved vocabulary file.
-         """
-         vocab_file = f"{save_directory}/{filename_prefix + '-' if filename_prefix else ''}vocab.json"
-         with open(vocab_file, "w", encoding="utf-8") as f:
-             json.dump(self.vocab, f, indent=2, ensure_ascii=False)
-         return (vocab_file,)
-
-     def get_vocab(self) -> dict:
-         """
-         Retrieves the vocabulary used by the tokenizer.
-
-         Returns:
-             dict: The vocabulary as a dictionary.
-         """
-         return self.vocab
-
-
- # EXAMPLE OF USAGE
-
- # sequence = "( not ( x_1 <= 0.2988 ) until[11,21] x_0 <= -0.7941 )"
- # tokenizer = STLTokenizer('tokenizer_files/tokenizer.json')
- # token_ids = tokenizer.encode(sequence)
- # decoded_sequence = tokenizer.decode(token_ids)
-
- # print("Original sequence: ", sequence)
- # print("Encoded sequence: ", token_ids)
- # print("Decoded sequence: ", decoded_sequence)
-
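- # SELF-CONTAINED VARIANT (editor's sketch: the toy vocabulary and temp-file
- # handling below are illustrative assumptions, not the project's real
- # tokenizer_files/tokenizer.json):
-
- # import json, os, tempfile
- # vocab = {"pad": 0, "unk": 1, "/s": 2, "s": 3, "@": 4, "(": 5, ")": 6,
- #          "not": 7, "x_0": 8, "x_1": 9, "<=": 10, "until[11,21]": 11,
- #          "0.2988": 12, "-0.7941": 13}
- # with tempfile.NamedTemporaryFile("w", suffix=".json", delete=False) as f:
- #     json.dump(vocab, f)
- # tokenizer = STLTokenizer(f.name)
- # token_ids = tokenizer.encode("( not ( x_1 <= 0.2988 ) until[11,21] x_0 <= -0.7941 )")
- # print(tokenizer.decode(token_ids))  # "/s ( not ( x_1 <= ... ) s"
- # os.remove(f.name)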