yuccaaa
/

ProtT3_model

Model card Files Files and versions

ProtT3_model / data_provider /gal_helpers.py

yuccaaa's picture

Add files using upload-large-folder tool

4d12519 verified 6 months ago

history blame contribute delete

1.39 kB

	import re


	# We split individual characters inside special tokens like [START_DNA].
	# Groups: 1 = start token, 2 = sequence type, 3 = sequence body (non-greedy),
	# 4 = end token, whose type must equal group 2 via the \2 backreference.
	# The alternation uses plain "|"; an escaped "\|" would match a literal pipe
	# and the pattern would never match real tokens such as [START_DNA].
	# NOTE: no DOTALL flag is set, so "." does not cross a newline in the body.
	CUSTOM_SEQ_RE = re.compile(r"(\[START_(DNA|SMILES|I_SMILES|AMINO)])(.*?)(\[END_\2])")

	# Token added to implement a custom sequence tokenization. This token is added at
	# the corpus cleaning step and removed in pretokenization. The digits are added to
	# increase the chance that the token does not occur in the corpus. The digits are
	# escaped (interpolated via the f-string) so that the token does not appear
	# literally in the source code in case we ever include it in the training data.
	SPLIT_MARKER = f"SPL{1}T-TH{1}S-Pl3A5E"

	def _insert_split_marker(m: re.Match):
	    """
	    Builds the replacement text for one matched custom sequence by placing
	    SPLIT_MARKER in front of every character of the sequence body, for
	    special tokens such as [START_DNA].

	    Parameters
	    ----------
	    m : re.Match
	        Match whose four groups unpack to (start token, sequence type,
	        sequence body, end token)

	    Returns
	    ----------
	    str - the start token, the body with SPLIT_MARKER before each character,
	    one further SPLIT_MARKER, then the end token
	    """
	    opening, _kind, payload, closing = m.groups()
	    # Every character (newlines included) gets its own marker prefix.
	    marked = "".join(SPLIT_MARKER + char for char in payload)
	    return opening + marked + SPLIT_MARKER + closing


	def escape_custom_split_sequence(text):
	    """
	    Rewrites every span of the text matched by CUSTOM_SEQ_RE using
	    _insert_split_marker, as custom splitting for GALILEO's tokenization.
	    Text outside the matched spans is left untouched.

	    Parameters
	    ----------
	    text : str
	        Input text to split

	    Returns
	    ----------
	    str - the text with the split token added
	    """
	    escaped = CUSTOM_SEQ_RE.sub(_insert_split_marker, text)
	    return escaped