Spaces:

alibtsd
/

FlowProt

Running

App Files Files Community

FlowProt / model /ProteinMPNN /helper_scripts /parse_multiple_chains.py

alibtsd

Deploy FlowProt Docker Space

f34af6f verified 15 days ago

Raw

History Blame Contribute Delete

7.3 kB

	import argparse


	def main(args):
	    """Convert every ``*.pdb`` file in a folder into one JSON-lines file.

	    Each PDB file becomes one JSON object (one output line) with these keys,
	    in this order:

	    * ``seq_chain_<ID>``    - one-letter sequence of chain ``<ID>``
	    * ``coords_chain_<ID>`` - dict of per-atom coordinate lists, keyed
	      ``N_chain_<ID>``, ``CA_chain_<ID>``, ``C_chain_<ID>``, ``O_chain_<ID>``
	      (only ``CA_chain_<ID>`` when ``ca_only`` is set)
	    * ``name``              - file name without directory and extension
	    * ``num_of_chains``     - number of chains found
	    * ``seq``               - all chain sequences concatenated

	    Residue numbers missing between the first and last residue of a chain
	    are emitted as ``-`` in the sequence with NaN coordinates; atoms missing
	    from a present residue are NaN as well.

	    Args:
	        args: Namespace-like object with three attributes:
	            ``input_path`` (folder containing the ``.pdb`` files),
	            ``output_path`` (path of the ``.jsonl`` file to write) and
	            ``ca_only`` (truthy to keep only the CA atom of each residue
	            instead of N, CA, C and O).
	    """
	    import glob
	    import json
	    import os

	    import numpy as np

	    folder_with_pdbs_path = args.input_path
	    save_path = args.output_path
	    ca_only = args.ca_only

	    alpha_1 = "ARNDCQEGHILKMFPSTWYV-"
	    alpha_3 = [
	        "ALA",
	        "ARG",
	        "ASN",
	        "ASP",
	        "CYS",
	        "GLN",
	        "GLU",
	        "GLY",
	        "HIS",
	        "ILE",
	        "LEU",
	        "LYS",
	        "MET",
	        "PHE",
	        "PRO",
	        "SER",
	        "THR",
	        "TRP",
	        "TYR",
	        "VAL",
	        "GAP",
	    ]
	    # Three-letter residue name -> one-letter code ("GAP" -> "-").
	    # Unknown residue names are rendered as "-" by the lookups below.
	    aa_3_1 = dict(zip(alpha_3, alpha_1))

	    # The chain ID is read from a single column, so only one-character IDs
	    # can ever match. This list also fixes the order in which chains are
	    # written to the output: A-Z, then a-z, then 0-9.
	    chain_alphabet = list(
	        "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
	    ) + [str(digit) for digit in range(10)]
	    known_chains = set(chain_alphabet)

	    def read_chains(pdb_path):
	        """Read a PDB file once and group its ATOM records by chain.

	        Returns ``{chain_id: {res_num: {ins_code: residue}}}`` where
	        ``residue`` is ``{"name": three_letter_name, "atoms": {atom: xyz}}``
	        and ``res_num`` is the residue number from the file minus one.
	        Records whose chain ID is not in ``chain_alphabet`` are skipped
	        without being parsed.
	        """
	        chains = {}
	        with open(pdb_path, "rb") as handle:
	            for raw in handle:
	                line = raw.decode("utf-8", "ignore").rstrip()

	                # Treat selenomethionine as methionine. The replacement
	                # must be exactly as wide as "HETATM" (six characters);
	                # a shorter string shifts every fixed column to the left
	                # and corrupts the chain ID, residue number and coordinates.
	                if line[:6] == "HETATM" and line[17:20] == "MSE":
	                    line = line.replace("HETATM", "ATOM  ")
	                    line = line.replace("MSE", "MET")

	                if line[:4] != "ATOM":
	                    continue
	                chain_id = line[21:22]
	                if chain_id not in known_chains:
	                    continue

	                atom = line[12:16].strip()
	                res_name = line[17:20]
	                res_field = line[22:27].strip()
	                coords = [float(line[i : i + 8]) for i in (30, 38, 46)]

	                # A trailing letter in the residue field is an insertion code.
	                if res_field[-1].isalpha():
	                    ins_code, res_num = res_field[-1], int(res_field[:-1]) - 1
	                else:
	                    ins_code, res_num = "", int(res_field) - 1

	                by_ins_code = chains.setdefault(chain_id, {}).setdefault(res_num, {})
	                # The first record seen for a residue fixes its name, and
	                # the first record seen for an atom fixes its coordinates.
	                residue = by_ins_code.setdefault(
	                    ins_code, {"name": res_name, "atoms": {}}
	                )
	                if atom not in residue["atoms"]:
	                    residue["atoms"][atom] = np.array(coords)
	        return chains

	    def chain_to_arrays(residues, atoms):
	        """Turn one chain from ``read_chains`` into (coords, sequence).

	        ``coords`` has shape (length, len(atoms), 3); ``sequence`` is the
	        one-letter string of the same length.
	        """
	        missing = np.full(3, np.nan)
	        letters, coords = [], []
	        for res_num in range(min(residues), max(residues) + 1):
	            by_ins_code = residues.get(res_num)
	            if by_ins_code is None:
	                # Residue number absent from the file: emit a gap.
	                letters.append("-")
	                coords.extend(missing for _ in atoms)
	                continue
	            for ins_code in sorted(by_ins_code):
	                residue = by_ins_code[ins_code]
	                letters.append(aa_3_1.get(residue["name"], "-"))
	                coords.extend(residue["atoms"].get(a, missing) for a in atoms)
	        return np.array(coords).reshape(-1, len(atoms), 3), "".join(letters)

	    atoms = ["CA"] if ca_only else ["N", "CA", "C", "O"]

	    pdb_dict_list = []
	    # Sorted so the output order does not depend on directory listing order.
	    for biounit in sorted(glob.glob(os.path.join(folder_with_pdbs_path, "*.pdb"))):
	        chains = read_chains(biounit)
	        my_dict = {}
	        chain_seqs = []
	        for letter in chain_alphabet:
	            if letter not in chains:
	                continue
	            xyz, seq = chain_to_arrays(chains[letter], atoms)
	            chain_seqs.append(seq)
	            my_dict["seq_chain_" + letter] = seq
	            if ca_only:
	                # Kept as (length, 1, 3) nested lists in CA-only mode.
	                coords_dict_chain = {"CA_chain_" + letter: xyz.tolist()}
	            else:
	                coords_dict_chain = {}
	                for idx, atom in enumerate(atoms):
	                    key = atom + "_chain_" + letter
	                    coords_dict_chain[key] = xyz[:, idx, :].tolist()
	            my_dict["coords_chain_" + letter] = coords_dict_chain
	        my_dict["name"] = os.path.splitext(os.path.basename(biounit))[0]
	        my_dict["num_of_chains"] = len(chain_seqs)
	        my_dict["seq"] = "".join(chain_seqs)
	        pdb_dict_list.append(my_dict)

	    # Write only after every file parsed, so a parse error leaves no
	    # partially written output behind.
	    with open(save_path, "w") as f:
	        for entry in pdb_dict_list:
	            f.write(json.dumps(entry) + "\n")


	if __name__ == "__main__":
	    # Command-line entry point: build the argument parser, parse the
	    # command line and hand the resulting namespace to main().
	    argparser = argparse.ArgumentParser(
	        formatter_class=argparse.ArgumentDefaultsHelpFormatter
	    )

	    # Both paths are mandatory. Declaring them as required makes argparse
	    # report a clear usage error instead of main() failing later on None.
	    argparser.add_argument(
	        "--input_path",
	        type=str,
	        required=True,
	        help="Path to a folder with pdb files, e.g. /home/my_pdbs/",
	    )
	    argparser.add_argument(
	        "--output_path",
	        type=str,
	        required=True,
	        help="Path where to save .jsonl dictionary of parsed pdbs",
	    )
	    argparser.add_argument(
	        "--ca_only",
	        action="store_true",
	        default=False,
	        help="parse a backbone-only structure (default: false)",
	    )

	    args = argparser.parse_args()
	    main(args)