TR2-D2 / tr2d2-pep /tokenizer /my_tokenizers.py

Sophia Tang

Initial commit

5e90249 6 months ago

19 kB

	import collections
	import os
	import re
	from typing import List, Optional
	from transformers import PreTrainedTokenizer
	from SmilesPE.tokenizer import SPE_Tokenizer
	import torch

	def load_vocab(vocab_file):
	    """Read a one-token-per-line vocabulary file into an ordered mapping.

	    Args:
	        vocab_file: Path of a UTF-8 text file holding one token per line.

	    Returns:
	        A ``collections.OrderedDict`` mapping each token (trailing newline
	        removed) to its zero-based line number. If a token occurs on several
	        lines it keeps the position of its first occurrence and the index of
	        its last one.
	    """
	    with open(vocab_file, "r", encoding="utf-8") as handle:
	        # Pair every stripped line with its line number; the OrderedDict
	        # constructor then applies the pairs in file order.
	        entries = [(line.rstrip("\n"), position) for position, line in enumerate(handle)]
	    return collections.OrderedDict(entries)

	class Atomwise_Tokenizer(object):
	    """Run atom-level SMILES tokenization with a single regular expression."""

	    def __init__(self):
	        """Compile the atom-level SMILES pattern.

	        The pattern is one capturing group of alternatives, tried left to
	        right at each position:

	        * a short parenthesised branch with at most four inner characters and
	          no nested parentheses, e.g. ``(=O)``, kept as ONE token;
	        * a bracket atom such as ``[NH4+]``;
	        * organic-subset atoms (``Br``, ``Cl``, ``C``, ``B``, ``N`` ...) and
	          aromatic atoms;
	        * single parentheses, bond symbols, ``/`` or ``//``, a backslash,
	          ``>`` or ``>>``, ``*``, ``$``, two-digit ring closures (``%NN``)
	          and single digits.
	        """
	        # The alternation bars must be real `|` operators and the parentheses
	        # must be escaped as `\(` / `\)`; with `\|` and `$` in their place the
	        # expression compiles but can never match a SMILES string.
	        self.regex_pattern = r"(\([^\(\)]{0,4}\)|\[[^\]]+]|Br?|Cl?|N|O|S|P|F|I|b|c|n|o|s|p|\(|\)|\.|=|#|-|\+|\\|\/\/?|:|~|@|\?|>>?|\*|\$|\%[0-9]{2}|[0-9])"

	        self.regex = re.compile(self.regex_pattern)

	    def tokenize(self, text):
	        """Split a SMILES string into a list of token strings.

	        Characters that match none of the alternatives are skipped silently
	        (``findall`` only reports matches).

	        Args:
	            text: The SMILES string to tokenize.

	        Returns:
	            The matched tokens in left-to-right order; ``[]`` for an empty
	            string.
	        """
	        # With exactly one capturing group, findall already yields a list of
	        # plain strings.
	        return self.regex.findall(text)

	class SMILES_SPE_Tokenizer(PreTrainedTokenizer):
	    r"""
	    Constructs a SMILES tokenizer. Based on SMILES Pair Encoding (https://github.com/XinhaoLi74/SmilesPE).
	    This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the methods. Users
	    should refer to the superclass for more information regarding methods.

	    Note that :meth:`encode`, :meth:`decode` and :meth:`batch_decode` are overridden here with simplified,
	    tensor-based behaviour (see their docstrings); they do not follow the superclass signatures.

	    Args:
	        vocab_file (:obj:`string`):
	            File containing the vocabulary.
	        spe_file (:obj:`string`):
	            File containing the trained SMILES Pair Encoding vocabulary.
	        unk_token (:obj:`string`, `optional`, defaults to "[UNK]"):
	            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
	            token instead.
	        sep_token (:obj:`string`, `optional`, defaults to "[SEP]"):
	            The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences
	            for sequence classification or for a text and a question for question answering.
	            It is also used as the last token of a sequence built with special tokens.
	        pad_token (:obj:`string`, `optional`, defaults to "[PAD]"):
	            The token used for padding, for example when batching sequences of different lengths.
	        cls_token (:obj:`string`, `optional`, defaults to "[CLS]"):
	            The classifier token which is used when doing sequence classification (classification of the whole
	            sequence instead of per-token classification). It is the first token of the sequence when built with
	            special tokens.
	        mask_token (:obj:`string`, `optional`, defaults to "[MASK]"):
	            The token used for masking values. This is the token used when training this model with masked language
	            modeling. This is the token which the model will try to predict.

	    Raises:
	        ValueError: If ``vocab_file`` or ``spe_file`` is not an existing file.
	    """

	    def __init__(self, vocab_file, spe_file,
	                 unk_token="[UNK]",
	                 sep_token="[SEP]",
	                 pad_token="[PAD]",
	                 cls_token="[CLS]",
	                 mask_token="[MASK]",
	                 **kwargs):
	        if not os.path.isfile(vocab_file):
	            raise ValueError("Can't find a vocabulary file at path '{}'.".format(vocab_file))
	        if not os.path.isfile(spe_file):
	            raise ValueError("Can't find a SPE vocabulary file at path '{}'.".format(spe_file))

	        # The vocabulary is populated BEFORE super().__init__ so that
	        # vocab_size / get_vocab() already work if the base constructor
	        # queries them while registering the special tokens.
	        self.vocab = load_vocab(vocab_file)
	        self.ids_to_tokens = collections.OrderedDict([(ids, tok) for tok, ids in self.vocab.items()])

	        # Close the codes file once the SPE tokenizer has been built instead
	        # of leaking the handle for the lifetime of the object.
	        # NOTE(review): relies on SPE_Tokenizer reading the merge codes
	        # eagerly in its constructor — confirm against the SmilesPE source.
	        with open(spe_file, 'r', encoding='utf-8') as spe_codes:
	            self.spe_tokenizer = SPE_Tokenizer(spe_codes)
	        # Attribute kept for backward compatibility; it now refers to a
	        # closed file object (``.name`` still gives the path).
	        self.spe_vocab = spe_codes

	        super().__init__(
	            unk_token=unk_token,
	            sep_token=sep_token,
	            pad_token=pad_token,
	            cls_token=cls_token,
	            mask_token=mask_token,
	            **kwargs)

	    @property
	    def vocab_size(self):
	        """Number of entries loaded from the vocabulary file (added tokens excluded)."""
	        return len(self.vocab)

	    def get_vocab(self):
	        """Return a new dict of the base vocabulary merged with the added tokens."""
	        return dict(self.vocab, **self.added_tokens_encoder)

	    def _tokenize(self, text):
	        """Tokenize a SMILES string with the SPE tokenizer and split its space-separated output."""
	        return self.spe_tokenizer.tokenize(text).split(' ')

	    def _convert_token_to_id(self, token):
	        """ Converts a token (str) in an id using the vocab; unknown tokens map to the id of ``unk_token``. """
	        return self.vocab.get(token, self.vocab.get(self.unk_token))

	    # changed encode and decode functions
	    def encode(self, token_array):
	        """Convert an already-tokenized sequence into model inputs.

	        Args:
	            token_array: Iterable of token strings (not a raw SMILES string).

	        Returns:
	            A dict with ``'input_ids'``, a tensor of shape ``(1, len(token_array) + 2)``,
	            and ``'attention_mask'``, a tensor of ones with the same shape.
	        """
	        # 2 and 3 are hard-coded start/end ids — presumably the positions of
	        # [CLS] and [SEP] in the shipped vocabulary file; verify before using
	        # a different vocabulary.
	        token_ids = [2]
	        token_ids.extend(self._convert_token_to_id(token) for token in token_array)
	        token_ids.append(3)
	        input_ids = torch.tensor([token_ids])
	        return {'input_ids': input_ids, 'attention_mask': torch.ones_like(input_ids)}

	    def decode(self, token_ids, skip_special_tokens=True):
	        """Turn a tensor of token ids back into a string.

	        Decoding stops at the first id equal to 3 (the same hard-coded end id
	        that :meth:`encode` appends). Tokens are concatenated without any
	        separator.

	        Args:
	            token_ids: ``torch.Tensor`` holding one sequence, either 1-D or
	                with a leading dimension of size 1.
	            skip_special_tokens: When true, ids listed in
	                ``self.all_special_ids`` are dropped from the output.

	        Returns:
	            The decoded string.
	        """
	        ids = token_ids.squeeze(0).cpu().tolist()
	        if isinstance(ids, int):
	            # A one-element 1-D row collapses to a scalar after squeeze(0).
	            ids = [ids]
	        # Resolve the special ids once instead of on every loop iteration.
	        special_ids = set(self.all_special_ids) if skip_special_tokens else set()
	        pieces = []
	        for idx in ids:
	            if idx == 3:  # Stop decoding when token ID 3 is encountered
	                break
	            if idx in special_ids:
	                continue
	            pieces.append(self._convert_id_to_token(idx))
	        return "".join(pieces)

	    def batch_decode(self, batch_token_ids, skip_special_tokens=True):
	        """Decode every sequence of a batch with :meth:`decode`.

	        Args:
	            batch_token_ids: Iterable of per-sequence id tensors (e.g. a 2-D tensor).
	            skip_special_tokens: Forwarded to :meth:`decode` for each sequence.

	        Returns:
	            A list with one decoded string per sequence.
	        """
	        # Forward the flag; previously it was accepted but silently ignored.
	        return [
	            self.decode(token_ids, skip_special_tokens=skip_special_tokens)
	            for token_ids in batch_token_ids
	        ]

	    def get_token_split(self, token_ids):
	        """Map a batch of id sequences to the corresponding token strings.

	        Args:
	            token_ids: A ``torch.Tensor`` or a nested list of ids, one inner
	                sequence per batch element.

	        Returns:
	            A list of lists of token strings; special tokens are kept and ids
	            missing from the vocabulary become ``unk_token``.
	        """
	        if isinstance(token_ids, torch.Tensor):
	            token_ids = token_ids.cpu().tolist()

	        return [
	            [self._convert_id_to_token(token_id) for token_id in seq_ids]
	            for seq_ids in token_ids
	        ]

	    def _convert_id_to_token(self, index):
	        """Converts an index (integer) in a token (str) using the vocab; unknown ids give ``unk_token``."""
	        return self.ids_to_tokens.get(index, self.unk_token)

	    def convert_tokens_to_string(self, tokens):
	        """ Converts a sequence of tokens (string) in a single string. """
	        out_string = " ".join(tokens).replace(" ##", "").strip()
	        return out_string

	    def build_inputs_with_special_tokens(
	        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
	    ) -> List[int]:
	        """
	        Build model inputs from a sequence or a pair of sequence for sequence classification tasks
	        by concatenating and adding special tokens.
	        A BERT sequence has the following format:
	        - single sequence: ``[CLS] X [SEP]``
	        - pair of sequences: ``[CLS] A [SEP] B [SEP]``
	        Args:
	            token_ids_0 (:obj:`List[int]`):
	                List of IDs to which the special tokens will be added
	            token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`):
	                Optional second list of IDs for sequence pairs.
	        Returns:
	            :obj:`List[int]`: list of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens.
	        """
	        cls = [self.cls_token_id]
	        sep = [self.sep_token_id]
	        if token_ids_1 is None:
	            return cls + token_ids_0 + sep
	        return cls + token_ids_0 + sep + token_ids_1 + sep

	    def get_special_tokens_mask(
	        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
	    ) -> List[int]:
	        """
	        Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
	        special tokens using the tokenizer ``prepare_for_model`` method.
	        Args:
	            token_ids_0 (:obj:`List[int]`):
	                List of ids.
	            token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`):
	                Optional second list of IDs for sequence pairs.
	            already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
	                Set to True if the token list is already formatted with special tokens for the model
	        Returns:
	            :obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
	        Raises:
	            ValueError: If ``already_has_special_tokens`` is set and ``token_ids_1`` is also given.
	        """

	        if already_has_special_tokens:
	            if token_ids_1 is not None:
	                raise ValueError(
	                    "You should not supply a second sequence if the provided sequence of "
	                    "ids is already formated with special tokens for the model."
	                )
	            # Only [SEP] and [CLS] are flagged here, not the other special tokens.
	            flagged = (self.sep_token_id, self.cls_token_id)
	            return [1 if token_id in flagged else 0 for token_id in token_ids_0]

	        if token_ids_1 is not None:
	            return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]
	        return [1] + ([0] * len(token_ids_0)) + [1]

	    def create_token_type_ids_from_sequences(
	        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
	    ) -> List[int]:
	        """
	        Creates a mask from the two sequences passed to be used in a sequence-pair classification task.
	        A BERT sequence pair mask has the following format:
	        ::
	            0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
	            | first sequence    | second sequence |
	        if token_ids_1 is None, only returns the first portion of the mask (0's).
	        Args:
	            token_ids_0 (:obj:`List[int]`):
	                List of ids.
	            token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`):
	                Optional second list of IDs for sequence pairs.
	        Returns:
	            :obj:`List[int]`: List of `token type IDs <../glossary.html#token-type-ids>`_ according to the given
	            sequence(s).
	        """
	        sep = [self.sep_token_id]
	        cls = [self.cls_token_id]
	        # First segment covers [CLS] + A + [SEP]; second covers B + [SEP].
	        first_segment = len(cls + token_ids_0 + sep) * [0]
	        if token_ids_1 is None:
	            return first_segment
	        return first_segment + len(token_ids_1 + sep) * [1]

	    def save_vocabulary(self, vocab_path, filename_prefix=None):
	        """
	        Write the vocabulary to a text file, one token per line, ordered by token index.
	        Args:
	            vocab_path (:obj:`str`):
	                Path of the file to write. If it is an existing directory, the vocabulary is written to
	                ``vocab.txt`` inside it (prefixed with ``filename_prefix-`` when a prefix is given).
	            filename_prefix (:obj:`str`, `optional`, defaults to :obj:`None`):
	                Optional prefix for the file name; only used when ``vocab_path`` is a directory. Accepted so
	                that callers passing this keyword (as the superclass ``save_pretrained`` does) keep working.
	        Returns:
	            :obj:`Tuple(str)`: Paths to the files saved.
	        """
	        if os.path.isdir(vocab_path):
	            file_name = "vocab.txt"
	            if filename_prefix:
	                file_name = filename_prefix + "-" + file_name
	            vocab_file = os.path.join(vocab_path, file_name)
	        else:
	            vocab_file = vocab_path
	        with open(vocab_file, "w", encoding="utf-8") as writer:
	            for token, _ in sorted(self.vocab.items(), key=lambda kv: kv[1]):
	                writer.write(token + "\n")
	        return (vocab_file,)

	class SMILES_Atomwise_Tokenizer(PreTrainedTokenizer):
	    r"""
	    Constructs an atom-level SMILES tokenizer. Raw strings are split by :class:`Atomwise_Tokenizer`
	    (regular-expression based), and tokens are mapped to ids through a plain vocabulary file.
	    This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the methods. Users
	    should refer to the superclass for more information regarding methods.
	    Args:
	        vocab_file (:obj:`string`):
	            File containing the vocabulary.
	        unk_token (:obj:`string`, `optional`, defaults to "[UNK]"):
	            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
	            token instead.
	        sep_token (:obj:`string`, `optional`, defaults to "[SEP]"):
	            The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences
	            for sequence classification or for a text and a question for question answering.
	            It is also used as the last token of a sequence built with special tokens.
	        pad_token (:obj:`string`, `optional`, defaults to "[PAD]"):
	            The token used for padding, for example when batching sequences of different lengths.
	        cls_token (:obj:`string`, `optional`, defaults to "[CLS]"):
	            The classifier token which is used when doing sequence classification (classification of the whole
	            sequence instead of per-token classification). It is the first token of the sequence when built with
	            special tokens.
	        mask_token (:obj:`string`, `optional`, defaults to "[MASK]"):
	            The token used for masking values. This is the token used when training this model with masked language
	            modeling. This is the token which the model will try to predict.

	    Raises:
	        ValueError: If ``vocab_file`` is not an existing file.
	    """

	    def __init__(
	        self,
	        vocab_file,
	        unk_token="[UNK]",
	        sep_token="[SEP]",
	        pad_token="[PAD]",
	        cls_token="[CLS]",
	        mask_token="[MASK]",
	        **kwargs
	    ):
	        if not os.path.isfile(vocab_file):
	            raise ValueError(
	                "Can't find a vocabulary file at path '{}'.".format(vocab_file)
	            )
	        # Populate the vocabulary BEFORE super().__init__ (as the SPE
	        # tokenizer in this file does): vocab_size / get_vocab() read
	        # self.vocab and would raise AttributeError if the base constructor
	        # queried them while registering the special tokens.
	        self.vocab = load_vocab(vocab_file)
	        self.ids_to_tokens = collections.OrderedDict([(ids, tok) for tok, ids in self.vocab.items()])
	        self.tokenizer = Atomwise_Tokenizer()

	        super().__init__(
	            unk_token=unk_token,
	            sep_token=sep_token,
	            pad_token=pad_token,
	            cls_token=cls_token,
	            mask_token=mask_token,
	            **kwargs,
	        )

	    @property
	    def vocab_size(self):
	        """Number of entries loaded from the vocabulary file (added tokens excluded)."""
	        return len(self.vocab)

	    def get_vocab(self):
	        """Return a new dict of the base vocabulary merged with the added tokens."""
	        return dict(self.vocab, **self.added_tokens_encoder)

	    def _tokenize(self, text):
	        """Split a SMILES string into atom-level tokens."""
	        return self.tokenizer.tokenize(text)

	    def _convert_token_to_id(self, token):
	        """ Converts a token (str) in an id using the vocab; unknown tokens map to the id of ``unk_token``. """
	        return self.vocab.get(token, self.vocab.get(self.unk_token))

	    def _convert_id_to_token(self, index):
	        """Converts an index (integer) in a token (str) using the vocab; unknown ids give ``unk_token``."""
	        return self.ids_to_tokens.get(index, self.unk_token)

	    def convert_tokens_to_string(self, tokens):
	        """ Converts a sequence of tokens (string) in a single string. """
	        out_string = " ".join(tokens).replace(" ##", "").strip()
	        return out_string

	    def build_inputs_with_special_tokens(
	        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
	    ) -> List[int]:
	        """
	        Build model inputs from a sequence or a pair of sequence for sequence classification tasks
	        by concatenating and adding special tokens.
	        A BERT sequence has the following format:
	        - single sequence: ``[CLS] X [SEP]``
	        - pair of sequences: ``[CLS] A [SEP] B [SEP]``
	        Args:
	            token_ids_0 (:obj:`List[int]`):
	                List of IDs to which the special tokens will be added
	            token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`):
	                Optional second list of IDs for sequence pairs.
	        Returns:
	            :obj:`List[int]`: list of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens.
	        """
	        cls = [self.cls_token_id]
	        sep = [self.sep_token_id]
	        if token_ids_1 is None:
	            return cls + token_ids_0 + sep
	        return cls + token_ids_0 + sep + token_ids_1 + sep

	    def get_special_tokens_mask(
	        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
	    ) -> List[int]:
	        """
	        Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
	        special tokens using the tokenizer ``prepare_for_model`` method.
	        Args:
	            token_ids_0 (:obj:`List[int]`):
	                List of ids.
	            token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`):
	                Optional second list of IDs for sequence pairs.
	            already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
	                Set to True if the token list is already formatted with special tokens for the model
	        Returns:
	            :obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
	        Raises:
	            ValueError: If ``already_has_special_tokens`` is set and ``token_ids_1`` is also given.
	        """

	        if already_has_special_tokens:
	            if token_ids_1 is not None:
	                raise ValueError(
	                    "You should not supply a second sequence if the provided sequence of "
	                    "ids is already formated with special tokens for the model."
	                )
	            # Only [SEP] and [CLS] are flagged here, not the other special tokens.
	            flagged = (self.sep_token_id, self.cls_token_id)
	            return [1 if token_id in flagged else 0 for token_id in token_ids_0]

	        if token_ids_1 is not None:
	            return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]
	        return [1] + ([0] * len(token_ids_0)) + [1]

	    def create_token_type_ids_from_sequences(
	        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
	    ) -> List[int]:
	        """
	        Creates a mask from the two sequences passed to be used in a sequence-pair classification task.
	        A BERT sequence pair mask has the following format:
	        ::
	            0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
	            | first sequence    | second sequence |
	        if token_ids_1 is None, only returns the first portion of the mask (0's).
	        Args:
	            token_ids_0 (:obj:`List[int]`):
	                List of ids.
	            token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`):
	                Optional second list of IDs for sequence pairs.
	        Returns:
	            :obj:`List[int]`: List of `token type IDs <../glossary.html#token-type-ids>`_ according to the given
	            sequence(s).
	        """
	        sep = [self.sep_token_id]
	        cls = [self.cls_token_id]
	        # First segment covers [CLS] + A + [SEP]; second covers B + [SEP].
	        first_segment = len(cls + token_ids_0 + sep) * [0]
	        if token_ids_1 is None:
	            return first_segment
	        return first_segment + len(token_ids_1 + sep) * [1]

	    def save_vocabulary(self, vocab_path, filename_prefix=None):
	        """
	        Write the vocabulary to a text file, one token per line, ordered by token index.
	        Args:
	            vocab_path (:obj:`str`):
	                Path of the file to write. If it is an existing directory, the vocabulary is written to
	                ``vocab.txt`` inside it (prefixed with ``filename_prefix-`` when a prefix is given).
	            filename_prefix (:obj:`str`, `optional`, defaults to :obj:`None`):
	                Optional prefix for the file name; only used when ``vocab_path`` is a directory. Accepted so
	                that callers passing this keyword (as the superclass ``save_pretrained`` does) keep working.
	        Returns:
	            :obj:`Tuple(str)`: Paths to the files saved.
	        """
	        if os.path.isdir(vocab_path):
	            file_name = "vocab.txt"
	            if filename_prefix:
	                file_name = filename_prefix + "-" + file_name
	            vocab_file = os.path.join(vocab_path, file_name)
	        else:
	            vocab_file = vocab_path
	        with open(vocab_file, "w", encoding="utf-8") as writer:
	            for token, _ in sorted(self.vocab.items(), key=lambda kv: kv[1]):
	                writer.write(token + "\n")
	        return (vocab_file,)