| import argparse |
|
|
|
|
def main(args):
    """Convert every ``*.pdb`` file in a folder into one JSON-lines file.

    Each PDB file yields one JSON object (one line of the output file) with
    the keys ``seq_chain_<id>`` and ``coords_chain_<id>`` for every chain that
    was found, followed by ``name`` (file name without ``.pdb``),
    ``num_of_chains`` and ``seq`` (all chain sequences concatenated).

    Args:
        args: Namespace-like object providing ``input_path`` (folder that
            holds the ``.pdb`` files), ``output_path`` (file the JSON lines
            are written to) and ``ca_only`` (truthy: extract only ``CA``
            atoms; falsy: extract ``N``, ``CA``, ``C`` and ``O``).

    Raises:
        ValueError: If an ATOM record of a recognised chain has a residue
            number or coordinate field that cannot be parsed as a number.
        OSError: If a PDB file cannot be read or the output cannot be written.
    """
    import glob
    import json
    import os
    import string

    import numpy as np

    folder_with_pdbs_path = args.input_path
    save_path = args.output_path
    ca_only = args.ca_only

    # One-letter / three-letter residue codes in matching order; the last
    # entry of each is the gap symbol used for unknown or missing residues.
    residue_letters = "ARNDCQEGHILKMFPSTWYV-"
    residue_names = [
        "ALA",
        "ARG",
        "ASN",
        "ASP",
        "CYS",
        "GLN",
        "GLU",
        "GLY",
        "HIS",
        "ILE",
        "LEU",
        "LYS",
        "MET",
        "PHE",
        "PRO",
        "SER",
        "THR",
        "TRP",
        "TYR",
        "VAL",
        "GAP",
    ]
    three_to_one = dict(zip(residue_names, residue_letters))
    gap_letter = residue_letters[-1]

    # Chains are emitted in this order. The chain id is read from a single
    # column of the record, so only one-character ids can ever match.
    chain_alphabet = string.ascii_uppercase + string.ascii_lowercase + string.digits
    known_chain_ids = frozenset(chain_alphabet)

    atom_names = ["CA"] if ca_only else ["N", "CA", "C", "O"]

    def read_chains(pdb_path):
        """Read *pdb_path* once and group its ATOM records by chain.

        Returns a dict ``{chain_id: {res_number: {insertion_code:
        (res_name, {atom_name: xyz})}}}``. For duplicated residues or atoms
        the first record in the file wins.
        """
        chains = {}
        with open(pdb_path, "rb") as handle:
            for raw in handle:
                line = raw.decode("utf-8", "ignore").rstrip()

                # Treat selenomethionine as methionine. The rewrite is done
                # by column so the record keeps its fixed-width layout
                # ("ATOM  " is six characters wide, like "HETATM").
                if line[:6] == "HETATM" and line[17:20] == "MSE":
                    line = "ATOM  " + line[6:17] + "MET" + line[20:]

                if line[:4] != "ATOM":
                    continue
                chain_id = line[21:22]
                if chain_id not in known_chain_ids:
                    continue

                atom_name = line[12:16].strip()
                res_name = line[17:20]
                res_field = line[22:27].strip()
                coord = np.array([float(line[i : i + 8]) for i in (30, 38, 46)])

                # A trailing letter on the residue number is an insertion code.
                if res_field[-1:].isalpha():
                    icode, res_number = res_field[-1], int(res_field[:-1])
                else:
                    icode, res_number = "", int(res_field)

                by_icode = chains.setdefault(chain_id, {}).setdefault(res_number, {})
                if icode not in by_icode:
                    by_icode[icode] = (res_name, {})
                atoms_seen = by_icode[icode][1]
                if atom_name not in atoms_seen:
                    atoms_seen[atom_name] = coord
        return chains

    def build_chain(residues):
        """Turn one chain's residues into ``(coords, sequence)``.

        ``coords`` has shape ``(length, len(atom_names), 3)``; residue numbers
        absent between the smallest and largest one become a gap letter with
        NaN coordinates, as do requested atoms a residue does not have.
        """
        missing = np.full(3, np.nan)
        letters, coords = [], []
        for res_number in range(min(residues), max(residues) + 1):
            by_icode = residues.get(res_number)
            if by_icode is None:
                letters.append(gap_letter)
                coords.extend(missing for _ in atom_names)
                continue
            for icode in sorted(by_icode):
                res_name, atoms_seen = by_icode[icode]
                letters.append(three_to_one.get(res_name, gap_letter))
                coords.extend(atoms_seen.get(name, missing) for name in atom_names)
        return np.array(coords).reshape(-1, len(atom_names), 3), "".join(letters)

    pdb_dict_list = []
    # Sorted so the order of the output lines does not depend on the OS.
    biounit_names = sorted(glob.glob(os.path.join(folder_with_pdbs_path, "*.pdb")))
    for biounit in biounit_names:
        chains = read_chains(biounit)
        my_dict = {}
        chain_seqs = []
        for letter in chain_alphabet:
            residues = chains.get(letter)
            if residues is None:
                continue
            xyz, chain_seq = build_chain(residues)
            chain_seqs.append(chain_seq)
            my_dict["seq_chain_" + letter] = chain_seq
            if ca_only:
                # Kept as (length, 1, 3): the singleton atom axis is preserved.
                coords_dict_chain = {"CA_chain_" + letter: xyz.tolist()}
            else:
                coords_dict_chain = {
                    name + "_chain_" + letter: xyz[:, idx, :].tolist()
                    for idx, name in enumerate(atom_names)
                }
            my_dict["coords_chain_" + letter] = coords_dict_chain
        my_dict["name"] = os.path.splitext(os.path.basename(biounit))[0]
        my_dict["num_of_chains"] = len(chain_seqs)
        my_dict["seq"] = "".join(chain_seqs)
        pdb_dict_list.append(my_dict)

    with open(save_path, "w") as f:
        for entry in pdb_dict_list:
            f.write(json.dumps(entry) + "\n")
|
|
|
|
if __name__ == "__main__":
    # Command-line entry point: parse the options and hand them to main().
    # ArgumentDefaultsHelpFormatter appends each option's default to its help.
    argparser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )

    # Both paths are mandatory: main() cannot run without them, so let
    # argparse report a missing one instead of failing later on a None value.
    argparser.add_argument(
        "--input_path",
        type=str,
        required=True,
        help="Path to a folder with pdb files, e.g. /home/my_pdbs/",
    )
    argparser.add_argument(
        "--output_path",
        type=str,
        required=True,
        help="Path where to save .jsonl dictionary of parsed pdbs",
    )
    argparser.add_argument(
        "--ca_only",
        action="store_true",
        default=False,
        help="parse a backbone-only structure (default: false)",
    )

    args = argparser.parse_args()
    main(args)
|
|