| import collections |
| import os |
| import re |
| from typing import List, Optional |
| from transformers import PreTrainedTokenizer |
| from SmilesPE.tokenizer import SPE_Tokenizer |
| import torch |
|
|
def load_vocab(vocab_file):
    """Read a one-token-per-line vocabulary file into an ordered mapping.

    Each line (with its trailing newline removed) becomes a key whose value
    is the zero-based line number. If a token occurs more than once, the
    last line number wins while the token keeps its first position.

    Args:
        vocab_file: Path of the UTF-8 encoded vocabulary file.

    Returns:
        A ``collections.OrderedDict`` mapping token -> index.
    """
    with open(vocab_file, "r", encoding="utf-8") as handle:
        # Pair every stripped line with its line number and let OrderedDict
        # consume the pairs in file order.
        entries = (
            (line.rstrip("\n"), position)
            for position, line in enumerate(handle)
        )
        return collections.OrderedDict(entries)
|
|
class Atomwise_Tokenizer(object):
    """Regex-driven, atom-level tokenizer for SMILES strings."""

    def __init__(self):
        """Compile the tokenization pattern once and keep it on the instance."""
        pattern = r"(\([^\(\)]{0,4}\)|\[[^\]]+]|Br?|Cl?|N|O|S|P|F|I|b|c|n|o|s|p|\(|\)|\.|=|#|-|\+|\\|\/\/?|:|~|@|\?|>>?|\*|\$|\%[0-9]{2}|[0-9])"
        self.regex_pattern = pattern
        self.regex = re.compile(pattern)

    def tokenize(self, text):
        """Split ``text`` into the successive substrings matched by the pattern.

        Characters that match none of the alternatives are skipped.

        Args:
            text: The SMILES string to split.

        Returns:
            A list of token strings in order of appearance.
        """
        # The pattern is a single capture group spanning the whole match, so
        # group(1) is exactly what ``findall`` would report.
        return [hit.group(1) for hit in self.regex.finditer(text)]
| |
class SMILES_SPE_Tokenizer(PreTrainedTokenizer):
    r"""
    Constructs a SMILES tokenizer. Based on SMILES Pair Encoding (https://github.com/XinhaoLi74/SmilesPE).
    This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the methods. Users
    should refer to the superclass for more information regarding methods.

    Note that :meth:`encode`, :meth:`decode` and :meth:`batch_decode` are overridden here with simplified,
    tensor-oriented signatures that differ from the superclass.

    Args:
        vocab_file (:obj:`string`):
            File containing the vocabulary.
        spe_file (:obj:`string`):
            File containing the trained SMILES Pair Encoding vocabulary.
        unk_token (:obj:`string`, `optional`, defaults to "[UNK]"):
            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
            token instead.
        sep_token (:obj:`string`, `optional`, defaults to "[SEP]"):
            The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences
            for sequence classification or for a text and a question for question answering.
            It is also used as the last token of a sequence built with special tokens.
        pad_token (:obj:`string`, `optional`, defaults to "[PAD]"):
            The token used for padding, for example when batching sequences of different lengths.
        cls_token (:obj:`string`, `optional`, defaults to "[CLS]"):
            The classifier token which is used when doing sequence classification (classification of the whole
            sequence instead of per-token classification). It is the first token of the sequence when built with
            special tokens.
        mask_token (:obj:`string`, `optional`, defaults to "[MASK]"):
            The token used for masking values. This is the token used when training this model with masked language
            modeling. This is the token which the model will try to predict.
    """

    def __init__(self, vocab_file, spe_file,
                 unk_token="[UNK]",
                 sep_token="[SEP]",
                 pad_token="[PAD]",
                 cls_token="[CLS]",
                 mask_token="[MASK]",
                 **kwargs):
        """Load both vocabularies, then initialise the base tokenizer.

        Raises:
            ValueError: If ``vocab_file`` or ``spe_file`` is not an existing file.
        """
        if not os.path.isfile(vocab_file):
            raise ValueError("Can't find a vocabulary file at path '{}'.".format(vocab_file))
        if not os.path.isfile(spe_file):
            raise ValueError("Can't find a SPE vocabulary file at path '{}'.".format(spe_file))

        # The vocabulary is populated before calling the base constructor so
        # that get_vocab()/vocab_size work if the base class queries them
        # while registering the special tokens.
        self.vocab = load_vocab(vocab_file)
        self.ids_to_tokens = collections.OrderedDict((idx, tok) for tok, idx in self.vocab.items())

        # Close the merges file as soon as SPE_Tokenizer has been built instead
        # of leaking the handle for the lifetime of the tokenizer.
        # NOTE(review): this relies on SPE_Tokenizer reading the whole file in
        # its constructor (as the SmilesPE implementation does) -- confirm if
        # the dependency is ever swapped.
        with open(spe_file, 'r', encoding='utf-8') as spe_codes:
            # Attribute kept for backward compatibility; it refers to a closed
            # file once this block exits.
            self.spe_vocab = spe_codes
            self.spe_tokenizer = SPE_Tokenizer(spe_codes)

        super().__init__(
            unk_token=unk_token,
            sep_token=sep_token,
            pad_token=pad_token,
            cls_token=cls_token,
            mask_token=mask_token,
            **kwargs)

    @property
    def vocab_size(self):
        """Number of entries in the base vocabulary (added tokens excluded)."""
        return len(self.vocab)

    def get_vocab(self):
        """Return a new dict merging the base vocabulary with added tokens."""
        return dict(self.vocab, **self.added_tokens_encoder)

    def _tokenize(self, text):
        """Tokenize ``text`` with the SPE tokenizer and split its space-separated output."""
        return self.spe_tokenizer.tokenize(text).split(' ')

    def _convert_token_to_id(self, token):
        """ Converts a token (str) in an id using the vocab; unknown tokens map to the ``unk_token`` id. """
        return self.vocab.get(token, self.vocab.get(self.unk_token))

    def encode(self, token_array):
        """Convert an already-tokenized sequence into model inputs.

        Args:
            token_array: Iterable of token strings.

        Returns:
            A dict with ``'input_ids'`` (a ``torch`` tensor of shape
            ``(1, len(token_array) + 2)``) and ``'attention_mask'`` (a tensor
            of ones with the same shape).
        """
        # NOTE(review): 2 and 3 are hard-coded start/end ids -- presumably the
        # [CLS] and [SEP] ids of the accompanying vocabulary file; confirm
        # against the vocab before reusing this class with another one.
        token_ids = [2]
        token_ids.extend(self._convert_token_to_id(token) for token in token_array)
        token_ids.append(3)
        token_ids = torch.tensor([token_ids])
        attn_mask = torch.ones_like(token_ids)
        return {'input_ids': token_ids, 'attention_mask': attn_mask}

    def decode(self, token_ids, skip_special_tokens=True):
        """Turn one sequence of ids back into a SMILES string.

        Decoding stops at the first id equal to 3 (see the note in
        :meth:`encode`), regardless of ``skip_special_tokens``.

        Args:
            token_ids: A tensor of shape ``(n,)`` or ``(1, n)``, or a flat
                list of ints.
            skip_special_tokens: When true, ids listed in
                ``self.all_special_ids`` are left out of the result.

        Returns:
            The tokens concatenated without separators.
        """
        if isinstance(token_ids, torch.Tensor):
            # Only drop a leading batch axis; squeezing a 1-D tensor of
            # length 1 would turn it into a scalar and break iteration.
            if token_ids.dim() > 1:
                token_ids = token_ids.squeeze(0)
            token_ids = token_ids.cpu().tolist()
        if isinstance(token_ids, int):
            token_ids = [token_ids]

        # all_special_ids is recomputed on every access, so resolve it once.
        special_ids = set(self.all_special_ids) if skip_special_tokens else ()

        token_array = []
        for idx in token_ids:
            if idx == 3:
                break
            if idx in special_ids:
                continue
            token_array.append(self._convert_id_to_token(idx))
        return "".join(token_array)

    def batch_decode(self, batch_token_ids, skip_special_tokens=True):
        """Decode every sequence in ``batch_token_ids`` with :meth:`decode`.

        Args:
            batch_token_ids: Iterable of id sequences (e.g. a 2-D tensor).
            skip_special_tokens: Forwarded to :meth:`decode`.

        Returns:
            A list with one decoded string per input sequence.
        """
        return [
            self.decode(token_ids, skip_special_tokens=skip_special_tokens)
            for token_ids in batch_token_ids
        ]

    def get_token_split(self, token_ids):
        """Map a batch of id sequences to the corresponding token strings.

        Args:
            token_ids: A tensor or nested list where each element is a
                sequence of ids.

        Returns:
            A list of lists of token strings, one inner list per sequence.
            No special-token filtering is applied.
        """
        if isinstance(token_ids, torch.Tensor):
            token_ids = token_ids.cpu().tolist()

        return [
            [self._convert_id_to_token(token_id) for token_id in seq_ids]
            for seq_ids in token_ids
        ]

    def _convert_id_to_token(self, index):
        """Converts an index (integer) in a token (str) using the vocab; unknown ids yield ``unk_token``."""
        return self.ids_to_tokens.get(index, self.unk_token)

    def convert_tokens_to_string(self, tokens):
        """ Converts a sequence of tokens (string) in a single string.

        Tokens are joined with single spaces, occurrences of ``" ##"`` are
        removed and surrounding whitespace is stripped.
        """
        out_string = " ".join(tokens).replace(" ##", "").strip()
        return out_string

    def build_inputs_with_special_tokens(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Build model inputs from a sequence or a pair of sequence for sequence classification tasks
        by concatenating and adding special tokens.
        A BERT sequence has the following format:
        - single sequence: ``[CLS] X [SEP]``
        - pair of sequences: ``[CLS] A [SEP] B [SEP]``
        Args:
            token_ids_0 (:obj:`List[int]`):
                List of IDs to which the special tokens will be added
            token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`):
                Optional second list of IDs for sequence pairs.
        Returns:
            :obj:`List[int]`: list of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens.
        """
        cls = [self.cls_token_id]
        sep = [self.sep_token_id]
        if token_ids_1 is None:
            return cls + token_ids_0 + sep
        return cls + token_ids_0 + sep + token_ids_1 + sep

    def get_special_tokens_mask(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
    ) -> List[int]:
        """
        Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
        special tokens using the tokenizer ``prepare_for_model`` method.
        Args:
            token_ids_0 (:obj:`List[int]`):
                List of ids.
            token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`):
                Optional second list of IDs for sequence pairs.
            already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
                Set to True if the token list is already formatted with special tokens for the model
        Returns:
            :obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
        Raises:
            ValueError: If ``already_has_special_tokens`` is set and ``token_ids_1`` is also given.
        """
        if already_has_special_tokens:
            if token_ids_1 is not None:
                raise ValueError(
                    "You should not supply a second sequence if the provided sequence of "
                    "ids is already formated with special tokens for the model."
                )
            # Only [SEP] and [CLS] are flagged in this mode.
            marker_ids = (self.sep_token_id, self.cls_token_id)
            return [1 if token_id in marker_ids else 0 for token_id in token_ids_0]

        if token_ids_1 is not None:
            return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]
        return [1] + ([0] * len(token_ids_0)) + [1]

    def create_token_type_ids_from_sequences(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Creates a mask from the two sequences passed to be used in a sequence-pair classification task.
        A BERT sequence pair mask has the following format:
        ::
            0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
            | first sequence    | second sequence |
        if token_ids_1 is None, only returns the first portion of the mask (0's).
        Args:
            token_ids_0 (:obj:`List[int]`):
                List of ids.
            token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`):
                Optional second list of IDs for sequence pairs.
        Returns:
            :obj:`List[int]`: List of `token type IDs <../glossary.html#token-type-ids>`_ according to the given
            sequence(s).
        """
        sep = [self.sep_token_id]
        cls = [self.cls_token_id]
        if token_ids_1 is None:
            return len(cls + token_ids_0 + sep) * [0]
        return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]

    def save_vocabulary(self, vocab_path, filename_prefix=None):
        """
        Write the base vocabulary to disk, one token per line, ordered by token id.
        Args:
            vocab_path (:obj:`str`):
                Either the path of the file to write, or an existing directory, in which case the
                vocabulary is written to ``vocab.txt`` inside it.
            filename_prefix (:obj:`str`, `optional`, defaults to :obj:`None`):
                Prefix (followed by ``-``) for the file name; only used when ``vocab_path`` is a directory.
        Returns:
            :obj:`Tuple(str)`: Paths to the files saved.
        """
        if os.path.isdir(vocab_path):
            # Directory + optional prefix is how the base class's
            # save_pretrained() invokes this hook.
            prefix = filename_prefix + "-" if filename_prefix else ""
            vocab_file = os.path.join(vocab_path, prefix + "vocab.txt")
        else:
            vocab_file = vocab_path
        with open(vocab_file, "w", encoding="utf-8") as writer:
            for token, _ in sorted(self.vocab.items(), key=lambda kv: kv[1]):
                writer.write(token + "\n")
        return (vocab_file,)
|
|
class SMILES_Atomwise_Tokenizer(PreTrainedTokenizer):
    r"""
    Constructs a SMILES tokenizer that splits the input at atom level using the regular expression of
    ``Atomwise_Tokenizer``.
    This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the methods. Users
    should refer to the superclass for more information regarding methods.
    Args:
        vocab_file (:obj:`string`):
            File containing the vocabulary.
        unk_token (:obj:`string`, `optional`, defaults to "[UNK]"):
            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
            token instead.
        sep_token (:obj:`string`, `optional`, defaults to "[SEP]"):
            The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences
            for sequence classification or for a text and a question for question answering.
            It is also used as the last token of a sequence built with special tokens.
        pad_token (:obj:`string`, `optional`, defaults to "[PAD]"):
            The token used for padding, for example when batching sequences of different lengths.
        cls_token (:obj:`string`, `optional`, defaults to "[CLS]"):
            The classifier token which is used when doing sequence classification (classification of the whole
            sequence instead of per-token classification). It is the first token of the sequence when built with
            special tokens.
        mask_token (:obj:`string`, `optional`, defaults to "[MASK]"):
            The token used for masking values. This is the token used when training this model with masked language
            modeling. This is the token which the model will try to predict.
    """

    def __init__(
        self,
        vocab_file,
        unk_token="[UNK]",
        sep_token="[SEP]",
        pad_token="[PAD]",
        cls_token="[CLS]",
        mask_token="[MASK]",
        **kwargs
    ):
        """Load the vocabulary, then initialise the base tokenizer.

        Raises:
            ValueError: If ``vocab_file`` is not an existing file.
        """
        if not os.path.isfile(vocab_file):
            raise ValueError(
                "Can't find a vocabulary file at path '{}'.".format(vocab_file)
            )

        # Everything the vocabulary accessors rely on is set up before the
        # base constructor runs: recent transformers releases call
        # get_vocab() while registering special tokens, which would fail with
        # AttributeError if ``self.vocab`` did not exist yet. Validating the
        # path first also avoids doing base-class work for a bad argument.
        self.vocab = load_vocab(vocab_file)
        self.ids_to_tokens = collections.OrderedDict((idx, tok) for tok, idx in self.vocab.items())
        self.tokenizer = Atomwise_Tokenizer()

        super().__init__(
            unk_token=unk_token,
            sep_token=sep_token,
            pad_token=pad_token,
            cls_token=cls_token,
            mask_token=mask_token,
            **kwargs,
        )

    @property
    def vocab_size(self):
        """Number of entries in the base vocabulary (added tokens excluded)."""
        return len(self.vocab)

    def get_vocab(self):
        """Return a new dict merging the base vocabulary with added tokens."""
        return dict(self.vocab, **self.added_tokens_encoder)

    def _tokenize(self, text):
        """Split ``text`` into atom-level tokens via ``Atomwise_Tokenizer``."""
        return self.tokenizer.tokenize(text)

    def _convert_token_to_id(self, token):
        """ Converts a token (str) in an id using the vocab; unknown tokens map to the ``unk_token`` id. """
        return self.vocab.get(token, self.vocab.get(self.unk_token))

    def _convert_id_to_token(self, index):
        """Converts an index (integer) in a token (str) using the vocab; unknown ids yield ``unk_token``."""
        return self.ids_to_tokens.get(index, self.unk_token)

    def convert_tokens_to_string(self, tokens):
        """ Converts a sequence of tokens (string) in a single string.

        Tokens are joined with single spaces, occurrences of ``" ##"`` are
        removed and surrounding whitespace is stripped.
        """
        out_string = " ".join(tokens).replace(" ##", "").strip()
        return out_string

    def build_inputs_with_special_tokens(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Build model inputs from a sequence or a pair of sequence for sequence classification tasks
        by concatenating and adding special tokens.
        A BERT sequence has the following format:
        - single sequence: ``[CLS] X [SEP]``
        - pair of sequences: ``[CLS] A [SEP] B [SEP]``
        Args:
            token_ids_0 (:obj:`List[int]`):
                List of IDs to which the special tokens will be added
            token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`):
                Optional second list of IDs for sequence pairs.
        Returns:
            :obj:`List[int]`: list of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens.
        """
        cls = [self.cls_token_id]
        sep = [self.sep_token_id]
        if token_ids_1 is None:
            return cls + token_ids_0 + sep
        return cls + token_ids_0 + sep + token_ids_1 + sep

    def get_special_tokens_mask(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
    ) -> List[int]:
        """
        Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
        special tokens using the tokenizer ``prepare_for_model`` method.
        Args:
            token_ids_0 (:obj:`List[int]`):
                List of ids.
            token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`):
                Optional second list of IDs for sequence pairs.
            already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
                Set to True if the token list is already formatted with special tokens for the model
        Returns:
            :obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
        Raises:
            ValueError: If ``already_has_special_tokens`` is set and ``token_ids_1`` is also given.
        """
        if already_has_special_tokens:
            if token_ids_1 is not None:
                raise ValueError(
                    "You should not supply a second sequence if the provided sequence of "
                    "ids is already formated with special tokens for the model."
                )
            # Only [SEP] and [CLS] are flagged in this mode.
            marker_ids = (self.sep_token_id, self.cls_token_id)
            return [1 if token_id in marker_ids else 0 for token_id in token_ids_0]

        if token_ids_1 is not None:
            return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]
        return [1] + ([0] * len(token_ids_0)) + [1]

    def create_token_type_ids_from_sequences(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Creates a mask from the two sequences passed to be used in a sequence-pair classification task.
        A BERT sequence pair mask has the following format:
        ::
            0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
            | first sequence    | second sequence |
        if token_ids_1 is None, only returns the first portion of the mask (0's).
        Args:
            token_ids_0 (:obj:`List[int]`):
                List of ids.
            token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`):
                Optional second list of IDs for sequence pairs.
        Returns:
            :obj:`List[int]`: List of `token type IDs <../glossary.html#token-type-ids>`_ according to the given
            sequence(s).
        """
        sep = [self.sep_token_id]
        cls = [self.cls_token_id]
        if token_ids_1 is None:
            return len(cls + token_ids_0 + sep) * [0]
        return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]

    def save_vocabulary(self, vocab_path, filename_prefix=None):
        """
        Write the base vocabulary to disk, one token per line, ordered by token id.
        Args:
            vocab_path (:obj:`str`):
                Either the path of the file to write, or an existing directory, in which case the
                vocabulary is written to ``vocab.txt`` inside it.
            filename_prefix (:obj:`str`, `optional`, defaults to :obj:`None`):
                Prefix (followed by ``-``) for the file name; only used when ``vocab_path`` is a directory.
        Returns:
            :obj:`Tuple(str)`: Paths to the files saved.
        """
        if os.path.isdir(vocab_path):
            # Directory + optional prefix is how the base class's
            # save_pretrained() invokes this hook.
            prefix = filename_prefix + "-" if filename_prefix else ""
            vocab_file = os.path.join(vocab_path, prefix + "vocab.txt")
        else:
            vocab_file = vocab_path
        with open(vocab_file, "w", encoding="utf-8") as writer:
            for token, _ in sorted(self.vocab.items(), key=lambda kv: kv[1]):
                writer.write(token + "\n")
        return (vocab_file,)
|
|