Add files using upload-large-folder tool

9627ce0 verified 6 months ago

9.54 kB

	# Copyright Generate Biomedicines, Inc.
	#
	# Licensed under the Apache License, Version 2.0 (the "License");
	# you may not use this file except in compliance with the License.
	# You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing, software
	# distributed under the License is distributed on an "AS IS" BASIS,
	# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	# See the License for the specific language governing permissions and
	# limitations under the License.

	"""Standard residue names for polymers of different types (e.g., L- or D-amino acid proteins,
	mixed-chirality proteins, DNA/RNA, etc.)
	"""

	from enum import Enum


	class polymerType(Enum):
	    """Kinds of polymer for which this module keeps residue tables.

	    The integer value of each member is used as the position of that
	    polymer type's entry in the module-level lookup lists.
	    """

	    LPROT = 0  # named "polypeptide(L)" by polymer_type_name
	    DPROT = 1  # named "polypeptide(D)"
	    LDPROT = 2  # named "polypeptide(L,D)"
	    DNA = 3  # named "polydeoxyribonucleotide"
	    RNA = 4  # named "polyribonucleotide"


	def polymer_type_name(ptype: polymerType):
	    """Return the descriptive name string for a polymer type.

	    Args:
	        ptype: the ``polymerType`` member to describe.

	    Returns:
	        One of ``"polypeptide(L)"``, ``"polypeptide(D)"``,
	        ``"polypeptide(L,D)"``, ``"polydeoxyribonucleotide"`` or
	        ``"polyribonucleotide"``.

	    Raises:
	        Exception: if ``ptype`` equals none of the known polymer types.
	    """
	    # NOTE(review): these strings look like PDBx/mmCIF polymer type names -- confirm.
	    labelled = (
	        (polymerType.LPROT, "polypeptide(L)"),
	        (polymerType.DPROT, "polypeptide(D)"),
	        (polymerType.LDPROT, "polypeptide(L,D)"),
	        (polymerType.DNA, "polydeoxyribonucleotide"),
	        (polymerType.RNA, "polyribonucleotide"),
	    )
	    # matched with == rather than by hashing, so any argument (hashable or
	    # not) that matches nothing reaches the error below
	    for kind, label in labelled:
	        if ptype == kind:
	            return label
	    raise Exception(f"unknown polymer type {ptype}")


	# Lookup tables, one entry per polymer type; every list below is indexed by
	# ``polymerType.<member>.value`` and is filled in by ``_add_residue``.

	# three-letter residue names in registration order (position == token index)
	_res3 = [list() for _ in polymerType]

	# single-letter residue codes, parallel to ``_res3``
	_res1 = [list() for _ in polymerType]

	# residue name (three-letter or single-letter) -> token index
	_res_to_idx = [{} for _ in polymerType]

	# token indices registered under the three-letter name "UNK"
	_unk_idx = [set() for _ in polymerType]

	# token indices registered under the three-letter name "---"
	_gap_idx = [set() for _ in polymerType]

	# token indices registered under the three-letter name "STP"
	_stp_idx = [set() for _ in polymerType]


	def _add_residue(ptype: polymerType, res3, res1):
	    """Append a residue to the lookup tables of one or several polymer types.

	    Args:
	        ptype: a ``polymerType`` member, or a list of members. When a list is
	            given, ``res3`` and ``res1`` must be sequences too; the three are
	            walked in lock-step and each triple is registered on its own.
	        res3: three-letter residue name (or a sequence of them).
	        res1: single-letter residue code (or a sequence of them).

	    The new residue gets the next free token index of its polymer type. The
	    names "---", "UNK" and "STP" are additionally recorded as gap, unknown
	    and stop tokens respectively.
	    """
	    if isinstance(ptype, list):
	        for one_type, name3, name1 in zip(ptype, res3, res1):
	            _add_residue(one_type, name3, name1)
	        return

	    slot = ptype.value
	    lookup = _res_to_idx[slot]
	    token = len(_res3[slot])

	    # a repeated three-letter name is re-pointed at the newest token
	    lookup[res3] = token
	    # single-letter code is ambiguous, so take the first residue when going
	    # from single-letter code to index
	    if res1 not in lookup:
	        lookup[res1] = token
	    _res3[slot].append(res3)
	    _res1[slot].append(res1)

	    # at most one of the special markers can match
	    for marker, registry in (("---", _gap_idx), ("UNK", _unk_idx), ("STP", _stp_idx)):
	        if res3 == marker:
	            registry[slot].add(token)
	            break


	def num_tokens(ptype=polymerType.LPROT):
	    """Return how many tokens (residues, unknown, stop and gap entries) are
	    registered for polymer type ``ptype``."""
	    registered = _res3[ptype.value]
	    return len(registered)


	def num_known_molecular_tokens(ptype=polymerType.LPROT):
	    """Count the tokens of ``ptype`` that stand for an actual, known residue.

	    A token is counted when it is neither punctuation (gap or stop) nor the
	    unknown-residue token.

	    Args:
	        ptype: polymer type whose token table is inspected.

	    Returns:
	        The number of such tokens, as an ``int``.
	    """
	    # Both checks are index-based and must be told which polymer type the
	    # index belongs to; the name-based ``is_unknown`` would treat an integer
	    # as an unregistered residue name and report every token as unknown.
	    return sum(
	        1
	        for idx in range(len(_res3[ptype.value]))
	        if not is_punctuation_index(idx, ptype) and not is_unknown_index(idx, ptype)
	    )


	def res_to_index(res: str, ptype=polymerType.LPROT):
	    """Translate a residue name into its token index for ``ptype``.

	    Args:
	        res: three-letter name or single-letter code.
	        ptype: polymer type whose table is consulted.

	    Returns:
	        The token index registered for ``res``; names that are not registered
	        map to an unknown-residue token of ``ptype``.

	    Raises:
	        StopIteration: if ``ptype`` has no unknown-residue token registered
	            (raised whether or not ``res`` itself is registered).
	    """
	    slot = ptype.value
	    # resolved up front, just as a default argument to dict.get would be
	    fallback = next(iter(_unk_idx[slot]))
	    return _res_to_idx[slot].get(res, fallback)


	def index_to_single(idx: int, ptype=polymerType.LPROT):
	    """Return the single-letter code stored at token index ``idx`` of ``ptype``.

	    Raises:
	        IndexError: if ``idx`` is outside the token table of ``ptype``.
	    """
	    codes = _res1[ptype.value]
	    return codes[idx]


	def index_to_triple(idx: int, ptype=polymerType.LPROT):
	    """Return the three-letter name stored at token index ``idx`` of ``ptype``.

	    Raises:
	        IndexError: if ``idx`` is outside the token table of ``ptype``.
	    """
	    names = _res3[ptype.value]
	    return names[idx]


	def to_single(res: str, ptype=polymerType.LPROT):
	    """Convert a residue name to the single-letter code used by ``ptype``.

	    Args:
	        res: three-letter name or single-letter code.
	        ptype: polymer type whose tables are used for BOTH the name lookup
	            and the conversion back to a code.

	    Returns:
	        The single-letter code; an unregistered ``res`` yields the code of
	        the unknown-residue token.
	    """
	    # the index is only meaningful within the table it was looked up in, so
	    # the same polymer type has to be used for the reverse lookup
	    return index_to_single(res_to_index(res, ptype), ptype)


	def to_triple(res: str, ptype=polymerType.LPROT):
	    """Convert a residue name to the three-letter name used by ``ptype``.

	    Args:
	        res: three-letter name or single-letter code.
	        ptype: polymer type whose tables are used for BOTH the name lookup
	            and the conversion back to a name.

	    Returns:
	        The three-letter name; an unregistered ``res`` yields the name of
	        the unknown-residue token.
	    """
	    # the index is only meaningful within the table it was looked up in, so
	    # the same polymer type has to be used for the reverse lookup
	    return index_to_triple(res_to_index(res, ptype), ptype)


	def is_gap_index(idx: int, ptype=polymerType.LPROT):
	    """Tell whether token index ``idx`` of ``ptype`` is a gap ("---") token."""
	    gap_tokens = _gap_idx[ptype.value]
	    return idx in gap_tokens


	def is_stop_index(idx: int, ptype=polymerType.LPROT):
	    """Tell whether token index ``idx`` of ``ptype`` is a stop ("STP") token."""
	    stop_tokens = _stp_idx[ptype.value]
	    return idx in stop_tokens


	def is_unknown(res: str, ptype=polymerType.LPROT):
	    """Tell whether the residue NAME ``res`` resolves to the unknown token.

	    ``res`` is first translated with ``res_to_index``; the resulting index
	    is then tested with ``is_unknown_index`` for the same polymer type.
	    """
	    token = res_to_index(res, ptype)
	    return is_unknown_index(token, ptype)


	def is_unknown_index(idx: int, ptype=polymerType.LPROT):
	    """Tell whether token index ``idx`` of ``ptype`` is an unknown ("UNK") token."""
	    unknown_tokens = _unk_idx[ptype.value]
	    return idx in unknown_tokens


	def is_polymer_residue(res: str, ptype: polymerType):
	    """Tell whether ``res`` is a registered residue name.

	    Args:
	        res: three-letter name or single-letter code.
	        ptype: polymer type to check against, or ``None`` to accept a name
	            registered for any polymer type.

	    Returns:
	        ``True`` if the name is registered, ``False`` otherwise.
	    """
	    if ptype is not None:
	        return res in _res_to_idx[ptype.value]
	    # no type given: a hit in the table of any known polymer type is enough
	    return any(res in _res_to_idx[kind.value] for kind in polymerType)


	def is_punctuation_index(idx: int, ptype=polymerType.LPROT):
	    """Tell whether token index ``idx`` of ``ptype`` is punctuation, i.e. a
	    gap token or a stop token."""
	    if is_gap_index(idx, ptype):
	        return True
	    return is_stop_index(idx, ptype)


	def is_canonical(res: str, ptype=polymerType.LPROT):
	    """Tell whether ``res`` is one of the 20 canonical amino acids of ``ptype``.

	    Args:
	        res: three-letter name or single-letter code.
	        ptype: ``LPROT``, ``DPROT`` or ``LDPROT``.

	    Returns:
	        ``True`` for a canonical amino acid. For ``LDPROT`` a residue is
	        canonical when it is canonical in either the L or the D table.

	    Raises:
	        Exception: for any other polymer type.
	    """
	    if ptype == polymerType.LPROT or ptype == polymerType.DPROT:
	        # the L and D tables register the 20 standard amino acids first, so
	        # they occupy token indices 0..19; unregistered names resolve to the
	        # unknown token, whose index lies beyond that range
	        return 0 <= res_to_index(res, ptype) < 20
	    if ptype == polymerType.LDPROT:
	        # Check the name against each single-chirality table directly. A D
	        # name is canonical because the D table says so; mirroring it first
	        # and then looking the L name up in the D table can never succeed.
	        return is_canonical(res, polymerType.LPROT) or is_canonical(
	            res, polymerType.DPROT
	        )
	    raise Exception(f"do not known how to deal with polymer type {ptype}")


	def canonical_amino_acids(ptype=polymerType.LPROT):
	    """List the three-letter names of ``ptype`` that ``is_canonical`` accepts.

	    Names are returned in token-table order as a new list.
	    """
	    return [name for name in _res3[ptype.value] if is_canonical(name, ptype)]


	# Fill the L- and D-protein tables in lock-step. Each row is
	# (L three-letter, D three-letter, L single-letter, D single-letter) and
	# both forms receive the same token index in their respective table.
	# Row order is significant: it fixes the token indices, and the first 20
	# rows are the ones is_canonical treats as canonical.
	for _l3, _d3, _l1, _d1 in (
	    ("ALA", "DAL", "A", "a"),
	    ("CYS", "DCY", "C", "c"),
	    ("ASP", "DAS", "D", "d"),
	    ("GLU", "DGL", "E", "e"),
	    ("PHE", "DPN", "F", "f"),
	    ("GLY", "GLY", "G", "G"),
	    ("HIS", "DHI", "H", "h"),
	    ("ILE", "DIL", "I", "i"),
	    ("LYS", "DLY", "K", "k"),
	    ("LEU", "DLE", "L", "l"),
	    ("MET", "MED", "M", "m"),
	    ("ASN", "DSG", "N", "n"),
	    ("PRO", "DPR", "P", "p"),
	    ("GLN", "DGN", "Q", "q"),
	    ("ARG", "DAR", "R", "r"),
	    ("SER", "DSN", "S", "s"),
	    ("THR", "DTH", "T", "t"),
	    ("VAL", "DVA", "V", "v"),
	    ("TRP", "DTR", "W", "w"),
	    ("TYR", "DTY", "Y", "y"),
	    # further residue names that share a single-letter code with one of
	    # the entries above
	    ("HSD", "DSD", "H", "h"),
	    ("HSE", "DSE", "H", "h"),
	    ("HSC", "DSC", "H", "h"),
	    ("HSP", "DSP", "H", "h"),
	    ("MSE", "DMS", "M", "m"),
	    ("CSO", "DCS", "C", "c"),
	    ("SEC", "DEC", "C", "c"),
	    ("CSX", "DCX", "C", "c"),
	    ("HIP", "DHP", "H", "h"),
	    ("SEP", "DEP", "S", "s"),
	    ("TPO", "DTP", "T", "t"),
	    ("PTR", "DPT", "Y", "y"),
	    # unknown, stop and gap tokens; the gap is registered twice so that
	    # both "-" and "." resolve to a gap token
	    ("UNK", "UNK", "X", "X"),
	    ("STP", "STP", "", ""),
	    ("---", "---", "-", "-"),
	    ("---", "---", ".", "."),
	):
	    _add_residue([polymerType.LPROT, polymerType.DPROT], [_l3, _d3], [_l1, _d1])
	# drop the loop temporaries so the module namespace is left as it was
	del _l3, _d3, _l1, _d1

	# Build the mixed-chirality (LDPROT) table from the L and D tables in three
	# passes, so that its tokens are ordered as: known residues (L first, then
	# D), then the unknown token, then punctuation (stop/gap). A three-letter
	# name already present in the LDPROT table is never added a second time.
	for grp in [1, 2, 3]:
	    for tp in [polymerType.LPROT, polymerType.DPROT]:
	        for idx in range(num_tokens(tp)):
	            if (
	                # pass 1: everything that is neither punctuation nor unknown
	                (
	                    grp == 1
	                    and not (
	                        is_punctuation_index(idx, tp) or is_unknown_index(idx, tp)
	                    )
	                )
	                # pass 2: the unknown token
	                or (grp == 2 and is_unknown_index(idx, tp))
	                # pass 3: stop and gap tokens
	                or (grp == 3 and is_punctuation_index(idx, tp))
	            ) and _res3[tp.value][idx] not in _res3[polymerType.LDPROT.value]:
	                _add_residue(
	                    polymerType.LDPROT,
	                    _res3[tp.value][idx],
	                    _res1[tp.value][idx],
	                )


	def mirror_amino_acid(res: str):
	    """Return the name of the opposite-chirality form of residue ``res``.

	    Args:
	        res: three-letter name or single-letter code, interpreted within the
	            mixed-chirality (``LDPROT``) table.

	    Returns:
	        The mirrored residue in the same notation as the input: a
	        single-letter code when ``res`` has length 1, otherwise a
	        three-letter name.
	    """
	    idx = mirror_amino_acid_index(res_to_index(res, polymerType.LDPROT))
	    # idx is an index into the LDPROT table, so it has to be converted back
	    # with that same table rather than the default L-protein one
	    if len(res) == 1:
	        return index_to_single(idx, polymerType.LDPROT)
	    return index_to_triple(idx, polymerType.LDPROT)


	def mirror_amino_acid_index(idx: int):
	    """Return the mixed-chirality token index of the mirror image of ``idx``.

	    Args:
	        idx: token index in the ``LDPROT`` table.

	    Returns:
	        The ``LDPROT`` index of the opposite-chirality residue. Unknown and
	        punctuation tokens, and indices outside the table, are returned as
	        is. A residue whose L and D names coincide (GLY) maps to itself.
	    """
	    ld = polymerType.LDPROT.value
	    if not 0 <= idx < len(_res3[ld]):
	        return idx
	    name = _res3[ld][idx]

	    # The L and D tables are filled in lock-step, so a residue and its mirror
	    # image share the same position in their respective tables. Go through
	    # that position instead of assuming the LDPROT table splits into two
	    # equal halves: GLY is shared, so the halves differ in size.
	    for own, other in (
	        (polymerType.LPROT, polymerType.DPROT),
	        (polymerType.DPROT, polymerType.LPROT),
	    ):
	        pos = _res_to_idx[own.value].get(name)
	        if pos is None:
	            continue
	        # if this is an unknown residue or a punctuation mark, return as is
	        if is_unknown_index(pos, own) or is_punctuation_index(pos, own):
	            continue
	        # otherwise, flip chirality
	        return _res_to_idx[ld].get(_res3[other.value][pos], idx)
	    return idx