Spaces:
Build error
Build error
File size: 4,274 Bytes
# author: Jan Velecky, adapted from Pedro
import os
import numpy as np
import gzip
import pickle
# Project root, two levels above the directory that holds this file.
# abspath() is needed because __file__ may be a bare relative name (e.g. when
# the script is started from its own directory); dirname() would then return ''
# and ROOT_DIR would collapse to '/../../', i.e. the filesystem root.
ROOT_DIR = os.path.dirname(os.path.abspath(__file__)) + '/../../'
# Input directories scanned for PDB files by the __main__ script.
RAW_DIR = ROOT_DIR + '/data_raw/pdbs/'
OPTIMS_DIR = ROOT_DIR + '/data_temp/optimized-pdbs/'
MUTANTS_DIR = ROOT_DIR + '/data_temp/mutated-pdbs/'
# Output directories: the chain FASTA list goes to PREPRO_DIR, the per-chain
# coordinate arrays to CHAINS_DIR. The trailing '/' is relied upon by the
# plain string concatenations that build file paths from these constants.
PREPRO_DIR = ROOT_DIR + '/data_preprocessed/'
CHAINS_DIR = PREPRO_DIR + '/chains/'
# should not contain multi-chains
# should deal with partial residues
# mutated residues should be in the structure
import warnings
# Silences every warning for the whole process, not only those raised while
# parsing (presumably meant to hide Biopython's PDB construction warnings — confirm).
warnings.filterwarnings("ignore")
from Bio.PDB.PDBParser import PDBParser
# NOTE(review): three_to_one appears to be deprecated/removed in newer Biopython
# releases — confirm against the pinned Biopython version.
from Bio.PDB.Polypeptide import is_aa, three_to_one
# Single module-level parser instance, shared by every _process_pdb call.
parser = PDBParser(PERMISSIVE=1)
# a dict of {pos: aa} with some str compatibility
class Type_Seq(dict):
    """Residue sequence stored as a ``{position: amino_acid}`` dict.

    Lookups by position work as in a plain dict, but iterating the object or
    converting it to ``str`` walks the amino acids (the values) in insertion
    order rather than the keys.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    def __str__(self):
        """Return the amino acids joined into a single sequence string."""
        return ''.join(self.values())

    def __repr__(self):
        """Return the representation of the underlying plain dict."""
        return repr(dict(self))

    def __iter__(self):
        """Iterate over the amino acids (values), unlike dict's key iteration."""
        return iter(self.values())

    # NOTE: slicing through __getitem__ is intentionally not implemented; it is
    # unclear whether a slice should select by position keys or by sequence order.

    def copy(self):
        """Return a shallow copy that is a Type_Seq; dict.copy alone gives a plain dict."""
        return Type_Seq(dict.copy(self))
def _process_pdb(filepath, gap_detect=True):
    """Parse a PDB file and yield the sequence and CA coordinates of each chain.

    Only the first model (``structure[0]``) is read. A residue contributes to
    the output when ``is_aa`` accepts its residue name and it contains a ``CA``
    atom; every other residue is skipped.

    Args:
        filepath: path of the PDB file. The file name without its last four
            characters is used as the structure name.
        gap_detect: when true, print a message whenever the residue number
            grows by more than 8 between two consecutive accepted residues.

    Yields:
        ``(chain_id, sequence, coords)`` for every chain of the first model.
        ``sequence`` is a ``Type_Seq`` keyed by residue number plus insertion
        code (as a string) with one-letter codes as values ('X' when
        ``three_to_one`` does not know the residue name). ``coords`` is a numpy
        array built from the CA coordinates in the order they were read.
    """
    structure_name = filepath.split('/')[-1][:-4]
    with open(filepath, 'rt') as ifh:
        structure = parser.get_structure(structure_name, ifh)
    for chain in structure[0]:
        res_aas = {}  # sequence from the pdb
        res_pos = []  # positions of Cα
        seqid1 = None  # residue number of the previously accepted residue
        for residue in chain:
            if is_aa(residue.get_resname()):
                if 'CA' in residue:
                    _, seqid, ins_code = residue.id
                    if gap_detect:
                        # Compare against None explicitly: residue number 0 is a
                        # valid previous position and must not disable the check.
                        if seqid1 is not None:
                            gap = seqid - seqid1
                            if gap > 8:
                                print("Gap of %i AAs detected at %s" % (gap, str(residue.full_id)))
                        seqid1 = seqid
                    seqid = (str(seqid) + ins_code).strip()  # remove ' ' for an empty insertion code
                    try:  # std amino acids
                        res_aas[seqid] = three_to_one(residue.get_resname())
                    except KeyError:  # name accepted by is_aa but without a one-letter code
                        res_aas[seqid] = 'X'
                    atom_ca = residue['CA']
                    res_pos.append(atom_ca.get_coord())
        res_pos = np.array(res_pos)
        yield chain.id, Type_Seq(res_aas), res_pos
def process_pdb(filepath, chain: str = None):
    """Process a PDB file, for all chains or for one selected chain.

    Args:
        filepath: path of the PDB file, passed on to ``_process_pdb``.
        chain: id of the chain to extract; when falsy, all chains are returned.

    Returns:
        Without ``chain``: the generator produced by ``_process_pdb``.
        With ``chain``: the first matching entry without its leading chain id,
        or ``None`` when no chain has that id.
    """
    all_chains = _process_pdb(filepath)
    if chain:
        # Stop at the first chain whose id matches; fall back to None.
        return next((entry[1:] for entry in all_chains if entry[0] == chain), None)
    return all_chains
def mutate_seq(res_aas: Type_Seq, wildtype: list, location: list, mutation: list) -> Type_Seq:
    """Return a copy of ``res_aas`` with the given point mutations applied.

    The three lists are walked in lockstep: at each ``location`` the residue
    must equal the ``wildtype`` entry and is then replaced by the ``mutation``
    entry. The input sequence itself is left unmodified.

    Raises:
        ValueError: if the residue found at a location differs from the
            expected wildtype residue.
    """
    mutated = res_aas.copy()
    for expected, pos, replacement in zip(wildtype, location, mutation):
        found = mutated[pos]
        if found != expected:
            raise ValueError("Wildtype residue mismatch: %s is actually %s" % (''.join((expected, str(pos))), found))
        mutated[pos] = replacement
    return mutated
if __name__ == "__main__":
    # Collect the PDB files to process. Directories are scanned in priority
    # order and a file name already seen in an earlier directory is skipped,
    # so the RAW structure is used only if no optimized one was found.
    list_pdbs = []
    set_pdbs = set()
    for pdb_path in [OPTIMS_DIR, RAW_DIR, MUTANTS_DIR]:
        pdbs = {f for f in os.listdir(pdb_path) if os.path.isfile(os.path.join(pdb_path, f))}
        pdbs -= set_pdbs
        set_pdbs |= pdbs  # merge sets
        # Sorted so the processing order (and the FASTA output) does not depend
        # on set iteration order, which can differ between runs.
        list_pdbs += [os.path.join(pdb_path, f) for f in sorted(pdbs)]

    total_chains = 0
    os.makedirs(CHAINS_DIR, exist_ok=True)  # folders for preprocessed data
    with open(PREPRO_DIR+"chain_list_pdb.fasta", 'w') as chain_list_file:
        for cur_iter, cur_pdb in enumerate(list_pdbs):
            structure_name = cur_pdb.split('/')[-1][:-4].lower()
            if cur_iter % 100 == 0:  # progress report every 100 files
                print("%i/%i %s" % (cur_iter, len(list_pdbs), structure_name))
            for chain_id, seq, res_pos in process_pdb(cur_pdb):  # todo: _process_pdb(cur_pdb, gap_detect=False)
                if len(seq) > 10:  # chains of 10 residues or fewer are skipped
                    total_chains += 1
                    chain_list_file.write(">"+structure_name+"."+chain_id+"\n")
                    # seq is a Type_Seq (a dict); it has to be converted
                    # explicitly, since dict + str raises TypeError.
                    chain_list_file.write(str(seq)+"\n")
                    np.save(CHAINS_DIR+structure_name+"."+chain_id, res_pos)
    print("Total chains:", total_chains)