import collections
import os
import re
from typing import List, Optional

import torch
from transformers import PreTrainedTokenizer
from SmilesPE.tokenizer import SPE_Tokenizer


def load_vocab(vocab_file):
    """Loads a vocabulary file into a dictionary."""
    vocab = collections.OrderedDict()
    with open(vocab_file, "r", encoding="utf-8") as reader:
        tokens = reader.readlines()
    for index, token in enumerate(tokens):
        token = token.rstrip("\n")
        vocab[token] = index
    return vocab
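

# The vocabulary file is expected to contain one token per line; the (0-based)
# line number becomes the token id. Illustrative layout (the actual tokens and
# ids depend on the file shipped with the model):
#     [PAD]    -> 0
#     [UNK]    -> 1
#     [CLS]    -> 2
#     [SEP]    -> 3
#     ...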


class Atomwise_Tokenizer(object):
    """Runs atom-level SMILES tokenization."""

    def __init__(self):
        """Constructs an atom-level tokenizer."""
        # Original SmilesPE atom-level pattern, kept for reference:
        # self.regex_pattern = r"(\[[^\]]+]|Br?|Cl?|N|O|S|P|F|I|b|c|n|o|s|p|\(|\)|\.|=|#|-|\+|\\|\/|:|~|@|\?|>>?|\*|\$|\%[0-9]{2}|[0-9])"
        # Modified pattern: short parenthesized branches (up to four characters
        # between the parentheses) are kept as single tokens.
        self.regex_pattern = r"(\([^\(\)]{0,4}\)|\[[^\]]+]|Br?|Cl?|N|O|S|P|F|I|b|c|n|o|s|p|\(|\)|\.|=|#|-|\+|\\|\/\/?|:|~|@|\?|>>?|\*|\$|\%[0-9]{2}|[0-9])"
        self.regex = re.compile(self.regex_pattern)

    def tokenize(self, text):
        """Basic tokenization of a SMILES string."""
        return self.regex.findall(text)
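

# Illustrative example; the output below follows from the modified pattern
# above, which keeps short parenthesized branches such as "(=O)" as single
# tokens:
#     >>> Atomwise_Tokenizer().tokenize("CC(=O)Oc1ccccc1C(=O)O")
#     ['C', 'C', '(=O)', 'O', 'c', '1', 'c', 'c', 'c', 'c', 'c', '1', 'C', '(=O)', 'O']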


class SMILES_SPE_Tokenizer(PreTrainedTokenizer):
    r"""
    Constructs a SMILES tokenizer based on SMILES Pair Encoding (https://github.com/XinhaoLi74/SmilesPE).
    This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer`, which contains most of the methods. Users
    should refer to the superclass for more information regarding those methods.

    Args:
        vocab_file (:obj:`string`):
            File containing the vocabulary.
        spe_file (:obj:`string`):
            File containing the trained SMILES Pair Encoding vocabulary.
        unk_token (:obj:`string`, `optional`, defaults to "[UNK]"):
            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
            token instead.
        sep_token (:obj:`string`, `optional`, defaults to "[SEP]"):
            The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences
            for sequence classification, or a text and a question for question answering.
            It is also used as the last token of a sequence built with special tokens.
        pad_token (:obj:`string`, `optional`, defaults to "[PAD]"):
            The token used for padding, for example when batching sequences of different lengths.
        cls_token (:obj:`string`, `optional`, defaults to "[CLS]"):
            The classifier token, which is used when doing sequence classification (classification of the whole
            sequence instead of per-token classification). It is the first token of the sequence when built with
            special tokens.
        mask_token (:obj:`string`, `optional`, defaults to "[MASK]"):
            The token used for masking values. This is the token used when training this model with masked language
            modeling. This is the token which the model will try to predict.
    """

    def __init__(self, vocab_file, spe_file,
                 unk_token="[UNK]",
                 sep_token="[SEP]",
                 pad_token="[PAD]",
                 cls_token="[CLS]",
                 mask_token="[MASK]",
                 **kwargs):
        if not os.path.isfile(vocab_file):
            raise ValueError("Can't find a vocabulary file at path '{}'.".format(vocab_file))
        if not os.path.isfile(spe_file):
            raise ValueError("Can't find a SPE vocabulary file at path '{}'.".format(spe_file))
        self.vocab = load_vocab(vocab_file)
        self.ids_to_tokens = collections.OrderedDict([(ids, tok) for tok, ids in self.vocab.items()])
        # SPE_Tokenizer consumes the merge rules in its constructor, so the
        # file handle can be closed right away (the original left it open).
        with open(spe_file, "r", encoding="utf-8") as spe_vocab:
            self.spe_tokenizer = SPE_Tokenizer(spe_vocab)
        super().__init__(
            unk_token=unk_token,
            sep_token=sep_token,
            pad_token=pad_token,
            cls_token=cls_token,
            mask_token=mask_token,
            **kwargs)

    @property
    def vocab_size(self):
        # PreTrainedTokenizer expects vocab_size to be a property; without the
        # decorator, len(tokenizer) breaks.
        return len(self.vocab)

    def get_vocab(self):
        return dict(self.vocab, **self.added_tokens_encoder)

    def _tokenize(self, text):
        return self.spe_tokenizer.tokenize(text).split(' ')

    def _convert_token_to_id(self, token):
        """Converts a token (str) to an id using the vocab."""
        return self.vocab.get(token, self.vocab.get(self.unk_token))

    # Custom encode/decode (replacing the PreTrainedTokenizer defaults):
    # encode takes a pre-tokenized array and returns batched tensors.
    def encode(self, token_array):
        # Wrap as [CLS] ... [SEP] using the configured special-token ids
        # instead of the hard-coded 2 and 3 of the original code, which
        # assumed [CLS] = 2 and [SEP] = 3 in the vocabulary file.
        token_ids = [self.cls_token_id]
        for token in token_array:
            token_ids.append(self._convert_token_to_id(token))
        token_ids.append(self.sep_token_id)
        token_ids = torch.tensor([token_ids])
        attn_mask = torch.ones_like(token_ids)
        return {'input_ids': token_ids, 'attention_mask': attn_mask}
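
    # Illustrative round trip (the ids depend on the vocabulary file, so treat
    # the shapes and values below as placeholders):
    #     >>> enc = tokenizer.encode(['CC', '(=O)', 'O'])
    #     >>> enc['input_ids'].shape   # torch.Size([1, 5]): [CLS] + 3 tokens + [SEP]
    #     >>> tokenizer.decode(enc['input_ids'])
    #     'CC(=O)O'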

    def decode(self, token_ids, skip_special_tokens=True):
        if isinstance(token_ids, torch.Tensor):
            token_ids = token_ids.squeeze(0).cpu().tolist()
        token_array = []
        for idx in token_ids:
            if idx == self.sep_token_id:  # stop decoding at [SEP] (id 3 in the original)
                break
            if skip_special_tokens and idx in self.all_special_ids:
                continue
            token_array.append(self._convert_id_to_token(idx))
        return "".join(token_array)

    def batch_decode(self, batch_token_ids, skip_special_tokens=True):
        # Forward skip_special_tokens; the original silently dropped it.
        return [self.decode(token_ids, skip_special_tokens=skip_special_tokens)
                for token_ids in batch_token_ids]

    def get_token_split(self, token_ids):
        if isinstance(token_ids, torch.Tensor):
            token_ids = token_ids.cpu().tolist()
        return [[self._convert_id_to_token(idx) for idx in seq_ids]
                for seq_ids in token_ids]

    def _convert_id_to_token(self, index):
        """Converts an index (integer) to a token (str) using the vocab."""
        return self.ids_to_tokens.get(index, self.unk_token)

    def convert_tokens_to_string(self, tokens):
        """Converts a sequence of tokens (strings) into a single string."""
        # The " ##" replacement is a WordPiece convention inherited from the
        # BERT tokenizer this class was adapted from; SPE tokens do not use it.
        out_string = " ".join(tokens).replace(" ##", "").strip()
        return out_string

    def build_inputs_with_special_tokens(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Builds model inputs from a sequence or a pair of sequences for sequence classification tasks
        by concatenating and adding special tokens.
        A BERT sequence has the following format:

        - single sequence: ``[CLS] X [SEP]``
        - pair of sequences: ``[CLS] A [SEP] B [SEP]``

        Args:
            token_ids_0 (:obj:`List[int]`):
                List of IDs to which the special tokens will be added.
            token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`):
                Optional second list of IDs for sequence pairs.

        Returns:
            :obj:`List[int]`: List of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens.
        """
        if token_ids_1 is None:
            return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
        cls = [self.cls_token_id]
        sep = [self.sep_token_id]
        return cls + token_ids_0 + sep + token_ids_1 + sep

    def get_special_tokens_mask(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
    ) -> List[int]:
        """
        Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
        special tokens using the tokenizer ``prepare_for_model`` method.

        Args:
            token_ids_0 (:obj:`List[int]`):
                List of ids.
            token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`):
                Optional second list of IDs for sequence pairs.
            already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
                Set to True if the token list is already formatted with special tokens for the model.

        Returns:
            :obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
        """
        if already_has_special_tokens:
            if token_ids_1 is not None:
                raise ValueError(
                    "You should not supply a second sequence if the provided sequence of "
                    "ids is already formatted with special tokens for the model."
                )
            return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0))
        if token_ids_1 is not None:
            return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]
        return [1] + ([0] * len(token_ids_0)) + [1]

    def create_token_type_ids_from_sequences(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Creates a mask from the two sequences passed, to be used in a sequence-pair classification task.
        A BERT sequence-pair mask has the following format:

        ::

            0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
            | first sequence    | second sequence |

        If ``token_ids_1`` is None, only the first portion of the mask (0s) is returned.

        Args:
            token_ids_0 (:obj:`List[int]`):
                List of ids.
            token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`):
                Optional second list of IDs for sequence pairs.

        Returns:
            :obj:`List[int]`: List of `token type IDs <../glossary.html#token-type-ids>`_ according to the given
            sequence(s).
        """
        sep = [self.sep_token_id]
        cls = [self.cls_token_id]
        if token_ids_1 is None:
            return len(cls + token_ids_0 + sep) * [0]
        return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]

    def save_vocabulary(self, vocab_path):
        """
        Saves the tokenizer's vocabulary to a file, one token per line in index order.

        Args:
            vocab_path (:obj:`str`):
                The file to which the vocabulary is written.

        Returns:
            :obj:`Tuple(str)`: Paths to the files saved.
        """
        index = 0
        vocab_file = vocab_path
        with open(vocab_file, "w", encoding="utf-8") as writer:
            for token, token_index in sorted(self.vocab.items(), key=lambda kv: kv[1]):
                if index != token_index:
                    # Non-consecutive indices would shift all later ids;
                    # resynchronize before writing.
                    index = token_index
                writer.write(token + "\n")
                index += 1
        return (vocab_file,)
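

# Minimal usage sketch for SMILES_SPE_Tokenizer. The file names are
# placeholders; substitute the vocabulary and SPE merge files shipped with
# your checkpoint:
#     tokenizer = SMILES_SPE_Tokenizer(vocab_file='spe_vocab.txt', spe_file='spe_codes.txt')
#     enc = tokenizer.encode(tokenizer._tokenize('CC(=O)Oc1ccccc1C(=O)O'))
#     smiles = tokenizer.decode(enc['input_ids'])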


class SMILES_Atomwise_Tokenizer(PreTrainedTokenizer):
    r"""
    Constructs an atom-level SMILES tokenizer, adapted from the SMILES Pair Encoding
    project (https://github.com/XinhaoLi74/SmilesPE).
    This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer`, which contains most of the methods. Users
    should refer to the superclass for more information regarding those methods.

    Args:
        vocab_file (:obj:`string`):
            File containing the vocabulary.
        unk_token (:obj:`string`, `optional`, defaults to "[UNK]"):
            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
            token instead.
        sep_token (:obj:`string`, `optional`, defaults to "[SEP]"):
            The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences
            for sequence classification, or a text and a question for question answering.
            It is also used as the last token of a sequence built with special tokens.
        pad_token (:obj:`string`, `optional`, defaults to "[PAD]"):
            The token used for padding, for example when batching sequences of different lengths.
        cls_token (:obj:`string`, `optional`, defaults to "[CLS]"):
            The classifier token, which is used when doing sequence classification (classification of the whole
            sequence instead of per-token classification). It is the first token of the sequence when built with
            special tokens.
        mask_token (:obj:`string`, `optional`, defaults to "[MASK]"):
            The token used for masking values. This is the token used when training this model with masked language
            modeling. This is the token which the model will try to predict.
    """

    def __init__(
        self,
        vocab_file,
        unk_token="[UNK]",
        sep_token="[SEP]",
        pad_token="[PAD]",
        cls_token="[CLS]",
        mask_token="[MASK]",
        **kwargs
    ):
        if not os.path.isfile(vocab_file):
            raise ValueError(
                "Can't find a vocabulary file at path '{}'.".format(vocab_file)
            )
        # Load the vocabulary before calling the superclass constructor, which
        # may query it (this mirrors SMILES_SPE_Tokenizer above).
        self.vocab = load_vocab(vocab_file)
        self.ids_to_tokens = collections.OrderedDict([(ids, tok) for tok, ids in self.vocab.items()])
        self.tokenizer = Atomwise_Tokenizer()
        super().__init__(
            unk_token=unk_token,
            sep_token=sep_token,
            pad_token=pad_token,
            cls_token=cls_token,
            mask_token=mask_token,
            **kwargs,
        )

    @property
    def vocab_size(self):
        # PreTrainedTokenizer expects vocab_size to be a property; without the
        # decorator, len(tokenizer) breaks.
        return len(self.vocab)

    def get_vocab(self):
        return dict(self.vocab, **self.added_tokens_encoder)

    def _tokenize(self, text):
        return self.tokenizer.tokenize(text)

    def _convert_token_to_id(self, token):
        """Converts a token (str) to an id using the vocab."""
        return self.vocab.get(token, self.vocab.get(self.unk_token))

    def _convert_id_to_token(self, index):
        """Converts an index (integer) to a token (str) using the vocab."""
        return self.ids_to_tokens.get(index, self.unk_token)

    def convert_tokens_to_string(self, tokens):
        """Converts a sequence of tokens (strings) into a single string."""
        # The " ##" replacement is a WordPiece convention inherited from the
        # BERT tokenizer this class was adapted from; atom-level SMILES tokens
        # do not use it.
        out_string = " ".join(tokens).replace(" ##", "").strip()
        return out_string

    def build_inputs_with_special_tokens(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Builds model inputs from a sequence or a pair of sequences for sequence classification tasks
        by concatenating and adding special tokens.
        A BERT sequence has the following format:

        - single sequence: ``[CLS] X [SEP]``
        - pair of sequences: ``[CLS] A [SEP] B [SEP]``

        Args:
            token_ids_0 (:obj:`List[int]`):
                List of IDs to which the special tokens will be added.
            token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`):
                Optional second list of IDs for sequence pairs.

        Returns:
            :obj:`List[int]`: List of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens.
        """
        if token_ids_1 is None:
            return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
        cls = [self.cls_token_id]
        sep = [self.sep_token_id]
        return cls + token_ids_0 + sep + token_ids_1 + sep

    def get_special_tokens_mask(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
    ) -> List[int]:
        """
        Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
        special tokens using the tokenizer ``prepare_for_model`` method.

        Args:
            token_ids_0 (:obj:`List[int]`):
                List of ids.
            token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`):
                Optional second list of IDs for sequence pairs.
            already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
                Set to True if the token list is already formatted with special tokens for the model.

        Returns:
            :obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
        """
        if already_has_special_tokens:
            if token_ids_1 is not None:
                raise ValueError(
                    "You should not supply a second sequence if the provided sequence of "
                    "ids is already formatted with special tokens for the model."
                )
            return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0))
        if token_ids_1 is not None:
            return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]
        return [1] + ([0] * len(token_ids_0)) + [1]

    def create_token_type_ids_from_sequences(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Creates a mask from the two sequences passed, to be used in a sequence-pair classification task.
        A BERT sequence-pair mask has the following format:

        ::

            0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
            | first sequence    | second sequence |

        If ``token_ids_1`` is None, only the first portion of the mask (0s) is returned.

        Args:
            token_ids_0 (:obj:`List[int]`):
                List of ids.
            token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`):
                Optional second list of IDs for sequence pairs.

        Returns:
            :obj:`List[int]`: List of `token type IDs <../glossary.html#token-type-ids>`_ according to the given
            sequence(s).
        """
        sep = [self.sep_token_id]
        cls = [self.cls_token_id]
        if token_ids_1 is None:
            return len(cls + token_ids_0 + sep) * [0]
        return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]

    def save_vocabulary(self, vocab_path):
        """
        Saves the tokenizer's vocabulary to a file, one token per line in index order.

        Args:
            vocab_path (:obj:`str`):
                The file to which the vocabulary is written.

        Returns:
            :obj:`Tuple(str)`: Paths to the files saved.
        """
        index = 0
        vocab_file = vocab_path
        with open(vocab_file, "w", encoding="utf-8") as writer:
            for token, token_index in sorted(self.vocab.items(), key=lambda kv: kv[1]):
                if index != token_index:
                    # Non-consecutive indices would shift all later ids;
                    # resynchronize before writing.
                    index = token_index
                writer.write(token + "\n")
                index += 1
        return (vocab_file,)
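

if __name__ == "__main__":
    # Smoke test. "atom_vocab.txt" is a placeholder; point it at the
    # vocabulary file shipped with your checkpoint before running.
    _vocab = "atom_vocab.txt"
    if os.path.isfile(_vocab):
        tok = SMILES_Atomwise_Tokenizer(vocab_file=_vocab)
        print(tok._tokenize("CC(=O)Oc1ccccc1C(=O)O"))
    else:
        # Without a vocabulary file, the bare regex tokenizer still works.
        print(Atomwise_Tokenizer().tokenize("CC(=O)Oc1ccccc1C(=O)O"))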