# author: Jan Velecky, adapted from Pedro
"""Extract per-chain amino-acid sequences and C-alpha coordinates from PDB files.

Run as a script, this walks the optimized, raw and mutated PDB folders (in that
order of preference), writes every chain longer than 10 residues to a FASTA
file and saves the matching C-alpha coordinates as ``.npy`` arrays.
"""
import os
import numpy as np
import gzip
import pickle

# Anchor on the absolute module location: a bare relative ``__file__`` has an
# empty dirname, which would turn ROOT_DIR into '/../../' (the filesystem root).
ROOT_DIR = os.path.dirname(os.path.abspath(__file__)) + '/../../'
RAW_DIR = ROOT_DIR + '/data_raw/pdbs/'
OPTIMS_DIR = ROOT_DIR + '/data_temp/optimized-pdbs/'
MUTANTS_DIR = ROOT_DIR + '/data_temp/mutated-pdbs/'
PREPRO_DIR = ROOT_DIR + '/data_preprocessed/'
CHAINS_DIR = PREPRO_DIR + '/chains/'

# Expectations on the input data:
# - should not contain multi-chains
# - should deal with partial residues
# - mutated residues should be in the structure

# The warning filter is installed before the Bio imports on purpose, so that it
# is already active while Biopython is being imported.
import warnings
warnings.filterwarnings("ignore")
from Bio.PDB.PDBParser import PDBParser
# NOTE(review): three_to_one is reportedly deprecated in recent Biopython
# releases -- confirm against the pinned Biopython version.
from Bio.PDB.Polypeptide import is_aa, three_to_one

parser = PDBParser(PERMISSIVE=1)


class Type_Seq(dict):
    """A dict of {position: amino acid} with some str compatibility.

    Keys are residue positions as strings (sequence number plus optional
    insertion code), values are one-letter amino-acid codes. Unlike a plain
    dict, iterating yields the *values* (residues), and ``str()`` gives the
    sequence string. Lookup, ``len()`` and ``in`` keep the dict semantics
    (i.e. they work on positions).
    """

    def __init__(self, *arg, **kw):
        super().__init__(*arg, **kw)

    def __str__(self):
        # directly convertible to amino-acid sequence string
        return ''.join(self.values())

    def __repr__(self):
        return str(dict(self))

    def __iter__(self):
        # dict default is over the keys; a sequence iterates over its residues
        yield from self.values()

    # NOTE: slicing via __getitem__ is intentionally not implemented; it is
    # unclear whether a slice should address positions (keys) or sequence
    # indexes.

    def copy(self):
        """Return a shallow copy that is again a Type_Seq.

        Using super's (dict's) copy would create a plain dict, not a Type_Seq.
        """
        return Type_Seq(dict(self))


def _structure_name(filepath):
    """Return the file name of *filepath* with its last 4 characters (e.g. '.pdb') removed."""
    return os.path.basename(filepath)[:-4]


def _process_pdb(filepath, gap_detect=True):
    """Yield ``(chain_id, sequence, ca_coords)`` for every chain of the first model.

    Only residues recognised by ``is_aa`` that have a ``CA`` atom are kept.

    Args:
        filepath: path to a PDB text file.
        gap_detect: when true, print a message whenever the residue number
            jumps by more than 8 between two consecutive kept residues.

    Yields:
        chain_id: the chain identifier.
        sequence: ``Type_Seq`` mapping position (residue number + insertion
            code) to the one-letter code, ``'X'`` for residues that
            ``three_to_one`` does not know.
        ca_coords: ``numpy`` array built from the list of C-alpha coordinates.
    """
    structure_name = _structure_name(filepath)
    # Parsing is finished once get_structure returns, so the file does not
    # need to stay open while the generator is being consumed.
    with open(filepath, 'rt') as ifh:
        structure = parser.get_structure(structure_name, ifh)

    for chain in structure[0]:
        res_aas = {}   # sequence from the pdb
        res_pos = []   # positions of C-alpha atoms
        prev_seqid = None
        for residue in chain:
            resname = residue.get_resname()
            if not is_aa(resname) or 'CA' not in residue:
                continue

            _, seqid, ins_code = residue.id
            if gap_detect:
                # Compare against None explicitly: residue number 0 is a valid
                # previous position and must not disable the check.
                if prev_seqid is not None:
                    gap = seqid - prev_seqid
                    if gap > 8:
                        print("Gap of %i AAs detected at %s" % (gap, str(residue.full_id)))
                prev_seqid = seqid

            # strip() removes the ' ' of an empty insertion code
            position = (str(seqid) + ins_code).strip()
            try:
                # standard amino acids
                res_aas[position] = three_to_one(resname)
            except KeyError:
                res_aas[position] = 'X'
            res_pos.append(residue['CA'].get_coord())

        yield chain.id, Type_Seq(res_aas), np.array(res_pos)


def process_pdb(filepath, chain: str = None, gap_detect: bool = True):
    """Read a PDB file and return its chains, or one selected chain.

    Args:
        filepath: path to a PDB text file.
        chain: chain identifier to select; if empty/None all chains are returned.
        gap_detect: forwarded to ``_process_pdb`` (print numbering gaps).

    Returns:
        Without ``chain``: a generator of ``(chain_id, sequence, ca_coords)``.
        With ``chain``: the tuple ``(sequence, ca_coords)`` of that chain, or
        ``None`` when the file has no chain with that identifier.
    """
    chains = _process_pdb(filepath, gap_detect=gap_detect)
    if not chain:
        # return generator (for all chains)
        return chains
    for chain_id, seq, coords in chains:
        # return the specified chain
        if chain_id == chain:
            return seq, coords
    return None


def mutate_seq(res_aas: Type_Seq, wildtype: list, location: list, mutation: list) -> Type_Seq:
    """Return a copy of ``res_aas`` with the given point mutations applied.

    The three lists are walked in lockstep: at ``location[i]`` the residue
    ``wildtype[i]`` is replaced by ``mutation[i]``. The input is not modified.

    Raises:
        ValueError: the residue found at a location differs from the wildtype.
        KeyError: a location is not present in ``res_aas``.
    """
    mutated = res_aas.copy()
    for wt, loc, mut in zip(wildtype, location, mutation):
        current = mutated[loc]
        if current != wt:
            raise ValueError("Wildtype residue mismatch: %s is actually %s" % (wt + str(loc), current))
        mutated[loc] = mut
    return mutated


def main():
    """Preprocess all PDB files into a FASTA chain list and per-chain coordinate files."""
    list_pdbs = []
    seen_names = set()
    # Folder order is a priority: a file name already found in an earlier
    # folder is skipped (use RAW structure only if optimized not found).
    for pdb_dir in [OPTIMS_DIR, RAW_DIR, MUTANTS_DIR]:
        names = {f for f in os.listdir(pdb_dir) if os.path.isfile(os.path.join(pdb_dir, f))}
        names -= seen_names
        seen_names |= names
        # sorted() keeps the processing (and FASTA) order reproducible
        list_pdbs += [os.path.join(pdb_dir, f) for f in sorted(names)]

    total_chains = 0
    os.makedirs(CHAINS_DIR, exist_ok=True)  # folders for preprocessed data
    with open(PREPRO_DIR + "chain_list_pdb.fasta", 'w') as chain_list_file:
        for cur_iter, cur_pdb in enumerate(list_pdbs):
            structure_name = _structure_name(cur_pdb).lower()
            if cur_iter % 100 == 0:
                print("%i/%i %s" % (cur_iter, len(list_pdbs), structure_name))
            # todo: process_pdb(cur_pdb, gap_detect=False)
            for chain_id, seq, res_pos in process_pdb(cur_pdb):
                if len(seq) > 10:
                    total_chains += 1
                    chain_name = structure_name + "." + chain_id
                    chain_list_file.write(">" + chain_name + "\n")
                    # seq is a Type_Seq (dict subclass); it has to be
                    # converted explicitly before concatenating with a str.
                    chain_list_file.write(str(seq) + "\n")
                    np.save(CHAINS_DIR + chain_name, res_pos)
    print("Total chains:", total_chains)


if __name__ == "__main__":
    main()