Spaces:

ai-gero
/

ProtoBind-Diff

Sleeping

ProtoBind-Diff / protobind_diff /ligands /smiles_tokenizer.py

vladimir.manuylov

initial commit

bd082dc 6 months ago

5.65 kB

	# Taken from https://github.com/MolecularAI/Chemformer/
	from typing import Any, Dict, List, Optional, Tuple, Union
	from pysmilesutils.tokenize import SMILESTokenizer

	class ChemformerTokenizer(SMILESTokenizer):
	    """
	    Tokenizer for the Chemformer.

	    There are a few features that set this apart from the `SMILESTokenizer`:
	        * It reserves two extra special tokens, "mask" and "sep"
	        * It distinguishes between chemical and non-chemical tokens

	    The `encoding_type` handed to the base class is fixed to "index", so it is
	    not a constructor parameter here.

	    :param smiles: A list of SMILES that are used to create the vocabulary for the tokenizer. Defaults to None.
	    :param tokens: A list of tokens (strings) that the tokenizer uses when tokenizing SMILES. Defaults to None.
	    :param regex_token_patterns: A list of regular expressions that the tokenizer uses when tokenizing SMILES.
	        Defaults to None.
	    :param beginning_of_smiles_token: Token that is added to beginning of SMILES. Defaults to "^".
	    :param end_of_smiles_token: Token that is added to the end of SMILES. Defaults to "&".
	    :param padding_token: Token used for padding. Defaults to "<PAD>".
	    :param unknown_token: Token that is used for unknown ids when decoding encoded data. Defaults to "?".
	    :param mask_token: Token that is used by the Masker. Defaults to "<MASK>".
	    :param sep_token: Token that is used to separate sentences, currently unused. Defaults to "<SEP>".
	    :param filename: if given and `smiles` is None, load the vocabulary from disc. Defaults to None.
	    """

	    def __init__(
	        self,
	        smiles: Optional[List[str]] = None,
	        tokens: Optional[List[str]] = None,
	        regex_token_patterns: Optional[List[str]] = None,
	        beginning_of_smiles_token: str = "^",
	        end_of_smiles_token: str = "&",
	        padding_token: str = "<PAD>",
	        unknown_token: str = "?",
	        mask_token: str = "<MASK>",
	        sep_token: str = "<SEP>",
	        filename: Optional[str] = None,
	    ) -> None:
	        # NOTE(review): these attributes are deliberately assigned BEFORE
	        # super().__init__(); presumably the base constructor calls the
	        # overridden hooks (`add_tokens`, `_reset_vocabulary`) that read them.
	        # Keep this ordering unless that has been verified otherwise.
	        self._mask_token = mask_token
	        self._sep_token = sep_token
	        # `_reset_vocabulary` places six special tokens at ids 0-5, so the
	        # first id that is not a special token is 6. `add_tokens` shifts this
	        # further for every non-regex (non-chemical) token that is added.
	        self._chem_start_idx = 6
	        # Lazily computed cache behind the `chem_token_idxs` property.
	        self._chem_token_idxs: Optional[List[int]] = None
	        super().__init__(
	            smiles=smiles,
	            tokens=tokens,
	            regex_token_patterns=regex_token_patterns,
	            beginning_of_smiles_token=beginning_of_smiles_token,
	            end_of_smiles_token=end_of_smiles_token,
	            padding_token=padding_token,
	            unknown_token=unknown_token,
	            encoding_type="index",
	            filename=filename,
	        )

	    @property
	    def chem_token_idxs(self) -> List[int]:
	        """Returns the indices of the vocabulary that are chemical tokens.

	        The list spans from the chemical start index up to the vocabulary size
	        and is cached until the cache is cleared by `add_tokens` or
	        `_update_state`.
	        """
	        if self._chem_token_idxs is None:
	            self._chem_token_idxs = list(range(self._chem_start_idx, len(self.vocabulary)))
	        return self._chem_token_idxs

	    @property
	    def mask_token_id(self) -> int:
	        """Get the mask token id, looked up in the current vocabulary"""
	        return self.vocabulary[self._mask_token]

	    @property
	    def vocab_size(self) -> int:
	        """Get the number of entries in the current vocabulary"""
	        return len(self.vocabulary)

	    @property
	    def special_tokens(self) -> Dict[str, str]:
	        """Returns a dictionary of non-character tokens, keyed by their role"""
	        return {
	            "start": self._beginning_of_smiles_token,
	            "end": self._end_of_smiles_token,
	            "pad": self._padding_token,
	            "unknown": self._unknown_token,
	            "mask": self._mask_token,
	            "sep": self._sep_token,
	        }

	    def add_tokens(self, tokens: List[str], regex: bool = False, smiles=None) -> None:
	        """Adds tokens to the classes list of tokens.

	        The new tokens are added to the front of the token list and take priority over old tokens. Note that
	        the vocabulary of the tokenizer is not updated after the tokens are added,
	        and must be updated by calling `create_vocabulary_from_smiles`.

	        If `regex` is False, the tokens are interpreted as non-chemical tokens, which distinguish
	        them for processing by e.g. the masker.

	        :param tokens: List of tokens to be added.
	        :param regex: If `True` the input tokens are treated as
	            regular expressions and are added to the list of regular expressions
	            instead of token list. Defaults to False.
	        :param smiles: If a list of smiles is provided, the vocabulary will be created, defaults to None

	        :raises ValueError: If any of the tokens supplied are already in the list
	            of tokens.
	        """
	        super().add_tokens(tokens, regex, smiles)
	        if not regex:
	            # Non-regex tokens are non-chemical, so they push the first
	            # chemical index further back.
	            self._chem_start_idx += len(tokens)
	        # Always drop the cached index list: supplying `smiles` rebuilds the
	        # vocabulary even when `regex` is True, which would otherwise leave a
	        # stale cache behind. The list is recomputed lazily on next access.
	        self._chem_token_idxs = None

	    def _reset_vocabulary(self) -> Dict[str, int]:
	        """Create a new tokens vocabulary.

	        The six special tokens take ids 0-5; every token in `self._tokens`
	        that is not already present then receives the next free id.

	        :return: New tokens vocabulary
	        """
	        dict_ = {
	            self._padding_token: 0,
	            self._unknown_token: 1,
	            self._beginning_of_smiles_token: 2,
	            self._end_of_smiles_token: 3,
	            self._mask_token: 4,
	            self._sep_token: 5,
	        }
	        for token in self._tokens:
	            # setdefault keeps the existing id if the token is already known
	            dict_.setdefault(token, len(dict_))
	        return dict_

	    def _state_properties(self) -> Dict[str, Any]:
	        """Return properties to reconstruct the internal state of the tokenizer.

	        Extends the base class properties with the chemical start index.
	        """
	        dict_ = super()._state_properties()
	        dict_["chem_start_idx"] = self._chem_start_idx
	        return dict_

	    def _update_state(self, dict_: Dict[str, Any]) -> None:
	        """Update the internal state with properties loaded from disc.

	        :param dict_: Loaded properties; must contain the key "chem_start_idx".
	        :raises KeyError: If "chem_start_idx" is missing from `dict_`.
	        """
	        super()._update_state(dict_)
	        self._chem_start_idx = dict_["chem_start_idx"]
	        # The cached list depends on the start index, so force a recompute.
	        self._chem_token_idxs = None