Spaces:
Build error
Build error
| # author: Jan Velecky, adapted from Pedro | |
| import os | |
| import numpy as np | |
| import gzip | |
| import pickle | |
# Project directory layout. Every constant ends with '/' because other code
# builds file paths by plain string concatenation onto these values.
# The module path is made absolute first: a bare relative __file__ gives an
# empty dirname, which would turn ROOT_DIR into '/../../' (the filesystem root).
ROOT_DIR = os.path.dirname(os.path.abspath(__file__)) + '/../../'
RAW_DIR = ROOT_DIR + 'data_raw/pdbs/'                  # input PDB structures
OPTIMS_DIR = ROOT_DIR + 'data_temp/optimized-pdbs/'    # optimized PDB structures
MUTANTS_DIR = ROOT_DIR + 'data_temp/mutated-pdbs/'     # mutated PDB structures
PREPRO_DIR = ROOT_DIR + 'data_preprocessed/'           # preprocessing output
CHAINS_DIR = PREPRO_DIR + 'chains/'                    # per-chain coordinate arrays
| # should not contain multi-chains | |
| # should deal with partial residues | |
| # mutated residues should be in the structure | |
# Silence all warnings for the whole process: the filter is installed with no
# category, message or module restriction.
# NOTE(review): presumably placed before the Bio imports so that warnings
# raised while importing Biopython are suppressed as well — confirm.
import warnings
warnings.filterwarnings("ignore")
from Bio.PDB.PDBParser import PDBParser
# NOTE(review): recent Biopython releases appear to have dropped three_to_one;
# verify against the Biopython version this project pins.
from Bio.PDB.Polypeptide import is_aa, three_to_one
# Single module-level parser shared by every call that reads a PDB file.
# PERMISSIVE=1 — see the PDBParser documentation for the exact semantics
# (presumably tolerates malformed records instead of aborting; confirm).
parser = PDBParser(PERMISSIVE=1)
class Type_Seq(dict):
    """A dict of ``{position: amino acid}`` with some str compatibility.

    Differences from a plain dict:

    * ``str(seq)`` joins the stored values into one sequence string;
    * iterating the object yields the values, not the keys;
    * ``copy()`` returns another ``Type_Seq``.

    Slicing is deliberately not implemented: it is unclear whether a slice
    should address position keys or indexes into the sequence.
    """

    def __str__(self):
        """Return the values concatenated into an amino-acid sequence string."""
        residues = self.values()
        return ''.join(residues)

    def __repr__(self):
        """Return the representation of the equivalent plain dict."""
        return repr(dict(self))

    def __iter__(self):
        """Iterate over the values (the dict default would be the keys)."""
        return iter(self.values())

    def copy(self):
        """Return a shallow copy that is a ``Type_Seq``.

        Using dict's own ``copy`` would create a plain dict instead.
        """
        plain = dict(self)
        return Type_Seq(plain)
def _process_pdb(filepath, gap_detect=True):
    """Parse a PDB file and yield sequence and C-alpha coordinates per chain.

    Only ``structure[0]`` (the first model) is read. A residue is kept when
    ``is_aa`` accepts its name and it contains a ``CA`` atom; every other
    residue is skipped. Kept residues whose name ``three_to_one`` cannot
    translate are recorded as ``'X'``.

    Args:
        filepath: Path to the PDB file. The structure name is the file name
            with its last four characters (e.g. ``.pdb``) removed.
        gap_detect: When true, print a message whenever the residue number
            grows by more than 8 between two consecutive kept residues.

    Yields:
        ``(chain_id, sequence, coords)`` for each chain, where ``sequence``
        is a ``Type_Seq`` keyed by residue number plus insertion code (as a
        string) and ``coords`` is a numpy array holding the ``CA``
        coordinates of the kept residues in the same order.
    """
    structure_name = filepath.split('/')[-1][:-4]
    with open(filepath, 'rt') as ifh:
        structure = parser.get_structure(structure_name, ifh)

    for chain in structure[0]:
        res_aas = {}  # sequence from the pdb
        res_pos = []  # positions of Cα
        prev_seqid = None
        for residue in chain:
            resname = residue.get_resname()
            if not is_aa(resname) or 'CA' not in residue:
                continue

            _, seqid, ins_code = residue.id
            if gap_detect:
                # Compare against None explicitly: residue number 0 is valid
                # and must still count as a previous residue.
                if prev_seqid is not None:
                    gap = seqid - prev_seqid
                    if gap > 8:
                        print("Gap of %i AAs detected at %s" % (gap, str(residue.full_id)))
                prev_seqid = seqid

            # remove ' ' for an empty insertion code
            key = (str(seqid) + ins_code).strip()
            try:  # std amino acids
                res_aas[key] = three_to_one(resname)
            except KeyError:
                res_aas[key] = 'X'
            res_pos.append(residue['CA'].get_coord())

        yield chain.id, Type_Seq(res_aas), np.array(res_pos)
def process_pdb(filepath, chain: str = None):
    """Parse a PDB file, optionally narrowing the result to one chain.

    Args:
        filepath: Path to the PDB file, passed on to ``_process_pdb``.
        chain: Identifier of the wanted chain. When falsy (``None`` or an
            empty string) all chains are returned.

    Returns:
        Without ``chain``: the generator produced by ``_process_pdb``.
        With ``chain``: the first matching entry without its chain id
        (everything after the first tuple element), or ``None`` when no
        chain has that identifier.
    """
    parsed = _process_pdb(filepath)
    if chain:
        # The first match wins; parsing stops as soon as it is found.
        return next((entry[1:] for entry in parsed if entry[0] == chain), None)
    return parsed
def mutate_seq(res_aas: "Type_Seq", wildtype: list, location: list, mutation: list) -> "Type_Seq":
    """Return a copy of ``res_aas`` with the given point mutations applied.

    The three lists are parallel: entry *i* says that the residue stored
    under ``location[i]`` is expected to be ``wildtype[i]`` and is replaced
    by ``mutation[i]``. Mutations are applied in order, and the input
    mapping itself is left unchanged.

    Args:
        res_aas: Mapping of position to amino acid; it must provide ``copy()``.
        wildtype: Expected current residue at each location.
        location: Keys of ``res_aas`` to mutate.
        mutation: Replacement residue for each location.

    Returns:
        The mutated copy, of whatever type ``res_aas.copy()`` produces.

    Raises:
        ValueError: If the three lists differ in length, or if the residue
            found at a location is not the expected wildtype.
        KeyError: If a location is not present in ``res_aas``.
    """
    # zip() would silently drop the surplus entries of the longer lists.
    if not (len(wildtype) == len(location) == len(mutation)):
        raise ValueError(
            "wildtype, location and mutation must have equal lengths, got %i, %i and %i"
            % (len(wildtype), len(location), len(mutation))
        )

    mutated = res_aas.copy()
    for w, l, m in zip(wildtype, location, mutation):
        if mutated[l] != w:
            raise ValueError("Wildtype residue mismatch: %s is actually %s" % (''.join((w, str(l))), mutated[l]))
        mutated[l] = m
    return mutated
if __name__ == "__main__":
    # Collect every PDB file exactly once. Directories are visited in
    # priority order and a file name already seen in an earlier directory is
    # skipped, so e.g. the RAW structure is used only if no optimized one
    # with the same name was found.
    list_pdbs = []
    set_pdbs = set()
    for pdb_path in [OPTIMS_DIR, RAW_DIR, MUTANTS_DIR]:
        pdbs = {f for f in os.listdir(pdb_path) if os.path.isfile(os.path.join(pdb_path, f))}
        pdbs -= set_pdbs
        set_pdbs |= pdbs  # merge sets
        # sorted() keeps the processing order, and therefore the FASTA
        # output, reproducible between runs (set order is arbitrary).
        list_pdbs += [os.path.join(pdb_path, f) for f in sorted(pdbs)]

    total_chains = 0
    os.makedirs(CHAINS_DIR, exist_ok=True)  # folders for preprocessed data
    with open(PREPRO_DIR+"chain_list_pdb.fasta", 'w') as chain_list_file:
        for cur_iter, cur_pdb in enumerate(list_pdbs):
            structure_name = cur_pdb.split('/')[-1][:-4].lower()
            if cur_iter % 100 == 0:  # progress report every 100 files
                print("%i/%i %s" % (cur_iter, len(list_pdbs), structure_name))
            for chain_id, seq, res_pos in process_pdb(cur_pdb):  # todo: _process_pdb(cur_pdb, gap_detect=False)
                # Only chains with more than 10 residues are exported.
                if len(seq) > 10:
                    total_chains += 1
                    chain_list_file.write(">"+structure_name+"."+chain_id+"\n")
                    # seq is a Type_Seq (a dict), which cannot be added to a
                    # str directly; convert it to the sequence string first.
                    chain_list_file.write(str(seq)+"\n")
                    np.save(CHAINS_DIR+structure_name+"."+chain_id, res_pos)
    print("Total chains:", total_chains)