Spaces:

vvelda
/

SoluProtMutDemo

Build error

App Files Files Community

SoluProtMutDemo / code /data_preprocessing /process.py

vvelda

Improvements

3068eb3 verified 8 months ago

raw

history blame contribute delete

4.27 kB

	# author: Jan Velecky, adapted from Pedro
	import os
	import numpy as np
	import gzip
	import pickle

	# Project root: two levels above the directory that holds this file.
	# abspath() guards against a relative __file__ (e.g. "process.py"), for which
	# dirname() returns '' and the result would point at the filesystem root.
	ROOT_DIR = os.path.dirname(os.path.abspath(__file__)) + '/../../'

	# Folders scanned for input structures by the __main__ section of this file.
	RAW_DIR = ROOT_DIR + '/data_raw/pdbs/'
	OPTIMS_DIR = ROOT_DIR + '/data_temp/optimized-pdbs/'
	MUTANTS_DIR = ROOT_DIR + '/data_temp/mutated-pdbs/'

	# Output folders. Every constant here ends with '/' because the script
	# appends file names to them by plain string concatenation.
	PREPRO_DIR = ROOT_DIR + '/data_preprocessed/'
	CHAINS_DIR = PREPRO_DIR + '/chains/'

	# should not contain multi-chains
	# should deal with partial residues
	# mutated residues should be in the structure

	import warnings
	warnings.filterwarnings("ignore")

	from Bio.PDB.PDBParser import PDBParser
	from Bio.PDB.Polypeptide import is_aa, three_to_one

	# Module-level parser instance used by _process_pdb() for every file.
	# NOTE(review): PERMISSIVE=1 presumably makes Biopython tolerate malformed
	# records instead of raising; confirm against the PDBParser documentation.
	parser = PDBParser(PERMISSIVE=1)

	# a dict of {pos: aa} with some str compatibility
	class Type_Seq(dict):
	    """Residue map ``{position: amino-acid letter}`` with sequence-like helpers.

	    Lookup, assignment, ``len()`` and ``in`` behave as in ``dict`` (keyed by
	    position). It differs from a plain ``dict`` in three ways:

	    * ``str(seq)`` concatenates the values in insertion order, giving the
	      amino-acid sequence string;
	    * iterating over the object yields the values, not the keys;
	    * ``copy()`` returns a ``Type_Seq`` instead of a plain ``dict``.

	    Slicing is intentionally not implemented: it is undecided whether a slice
	    should address positions (keys) or offsets in the sequence.
	    """

	    def __init__(self, arg=(), **kw):
	        """Build the map from anything ``dict()`` accepts.

	        Args:
	            arg: a mapping or an iterable of ``(position, letter)`` pairs;
	                optional, defaults to empty.
	            **kw: extra ``position=letter`` items, as for ``dict``.
	        """
	        super().__init__(arg, **kw)

	    def __str__(self):
	        """Return the values joined into one amino-acid sequence string."""
	        return ''.join(self.values())

	    def __repr__(self):
	        """Return the representation of the equivalent plain ``dict``."""
	        return repr(dict(self))

	    def __iter__(self):
	        """Iterate over the letters (values); ``dict`` would iterate the keys."""
	        yield from self.values()

	    def copy(self):
	        """Return a shallow copy that is still a ``Type_Seq``.

	        ``dict.copy`` would hand back a plain ``dict`` and lose the helpers.
	        """
	        return Type_Seq(dict(self))


	def _process_pdb(filepath, gap_detect=True):
	    """Parse a PDB file and yield one record per chain of ``structure[0]``.

	    Only residues accepted by ``is_aa`` that also contain a ``CA`` atom are
	    kept. Residue names that ``three_to_one`` cannot translate (``KeyError``)
	    are recorded as ``'X'``.

	    Args:
	        filepath: path of the PDB file; it is opened in text mode and fully
	            parsed before the first record is yielded.
	        gap_detect: when true, print a message whenever the residue number
	            increases by more than 8 between two consecutive kept residues.

	    Yields:
	        ``(chain_id, seq, coords)`` where ``seq`` is a ``Type_Seq`` keyed by
	        the residue number followed by its insertion code (as a string) and
	        ``coords`` is an ``np.array`` of the CA coordinates in the same order.
	    """
	    # The file name without its extension labels the parsed structure.
	    structure_name = os.path.splitext(os.path.basename(filepath))[0]

	    with open(filepath, 'rt') as ifh:
	        structure = parser.get_structure(structure_name, ifh)

	    for chain in structure[0]:
	        res_aas = {}  # sequence from the pdb
	        res_pos = []  # positions of Cα
	        prev_seqid = None
	        for residue in chain:
	            resname = residue.get_resname()
	            if not is_aa(resname) or 'CA' not in residue:
	                continue

	            _, seqid, ins_code = residue.id
	            if gap_detect:
	                # Compare against None explicitly: residue number 0 is a
	                # legitimate previous value and must not disable the check.
	                if prev_seqid is not None:
	                    gap = seqid - prev_seqid
	                    if gap > 8:
	                        print("Gap of %i AAs detected at %s" % (gap, str(residue.full_id)))
	                prev_seqid = seqid

	            # strip() removes the ' ' that stands for an empty insertion code
	            key = (str(seqid) + ins_code).strip()
	            # NOTE(review): three_to_one is reported as deprecated in newer
	            # Biopython releases; confirm the pinned version still ships it.
	            try:  # std amino acids
	                res_aas[key] = three_to_one(resname)
	            except KeyError:
	                res_aas[key] = 'X'
	            res_pos.append(residue['CA'].get_coord())

	        yield chain.id, Type_Seq(res_aas), np.array(res_pos)


	def process_pdb(filepath, chain: str = None):
	    """Return the parsed chains of a PDB file, or the data of one chain.

	    Args:
	        filepath: path handed to ``_process_pdb``.
	        chain: chain id to select; when empty or ``None`` all chains are
	            returned.

	    Returns:
	        Without ``chain``: the generator produced by ``_process_pdb``, whose
	        items are ``(chain_id, seq, coords)``. With ``chain``: the
	        ``(seq, coords)`` pair of the first record whose id equals ``chain``,
	        or ``None`` when no record matches.
	    """
	    records = _process_pdb(filepath)
	    if chain:
	        # Drop the leading chain id from the first matching record.
	        return next((rec[1:] for rec in records if rec[0] == chain), None)
	    return records


	def mutate_seq(res_aas: "Type_Seq", wildtype: list, location: list, mutation: list) -> "Type_Seq":
	    """Return a copy of ``res_aas`` with the given point mutations applied.

	    The three lists are read in lockstep: at ``location[i]`` the residue must
	    currently equal ``wildtype[i]`` and is replaced by ``mutation[i]``.
	    ``res_aas`` itself is never modified.

	    Args:
	        res_aas: residue map to mutate; only ``copy()`` and item access are
	            used.
	        wildtype: expected current letters.
	        location: keys of ``res_aas`` to change.
	        mutation: replacement letters.

	    Returns:
	        The mutated copy, as produced by ``res_aas.copy()``.

	    Raises:
	        ValueError: if the lists differ in length, or if the residue found at
	            a location is not the stated wildtype.
	        KeyError: if a location is not a key of ``res_aas``.
	    """
	    wildtype, location, mutation = list(wildtype), list(location), list(mutation)
	    # zip() would silently ignore the surplus entries of the longer lists.
	    if not (len(wildtype) == len(location) == len(mutation)):
	        raise ValueError(
	            "wildtype, location and mutation must have the same length, got %i, %i and %i"
	            % (len(wildtype), len(location), len(mutation))
	        )

	    mutated = res_aas.copy()
	    for wt, pos, new in zip(wildtype, location, mutation):
	        current = mutated[pos]
	        if current != wt:
	            raise ValueError("Wildtype residue mismatch: %s is actually %s" % (''.join((wt, str(pos))), current))
	        mutated[pos] = new

	    return mutated


	if __name__ == "__main__":
	    # Script entry point: parse every PDB file found in the input folders,
	    # write the chain sequences to one FASTA file and save each chain's CA
	    # coordinates as a .npy file.

	    list_pdbs = []
	    set_pdbs = set()
	    # Folders are scanned in priority order: a file name already seen in an
	    # earlier folder is skipped, so the RAW structure is used only if no
	    # optimized one was found.
	    for pdb_path in [OPTIMS_DIR, RAW_DIR, MUTANTS_DIR]:
	        pdbs = {f for f in os.listdir(pdb_path) if os.path.isfile(os.path.join(pdb_path, f))}
	        pdbs -= set_pdbs
	        set_pdbs |= pdbs  # merge sets
	        # sorted(): set order is arbitrary; keep the output order reproducible
	        list_pdbs += [os.path.join(pdb_path, f) for f in sorted(pdbs)]

	    total_chains = 0
	    os.makedirs(CHAINS_DIR, exist_ok=True)  # folders for preprocessed data

	    with open(PREPRO_DIR + "chain_list_pdb.fasta", 'w') as chain_list_file:
	        for cur_iter, cur_pdb in enumerate(list_pdbs):
	            # file name minus its 4-character extension, lower-cased
	            structure_name = os.path.basename(cur_pdb)[:-4].lower()

	            if cur_iter % 100 == 0:  # progress report
	                print("%i/%i %s" % (cur_iter, len(list_pdbs), structure_name))

	            for chain_id, seq, res_pos in process_pdb(cur_pdb):  # todo: _process_pdb(cur_pdb, gap_detect=False)
	                if len(seq) <= 10:  # only chains longer than 10 residues are kept
	                    continue
	                total_chains += 1

	                chain_list_file.write(">" + structure_name + "." + chain_id + "\n")
	                # seq is a Type_Seq (a dict subclass); it has no "+" operator,
	                # so convert it to its sequence string explicitly.
	                chain_list_file.write(str(seq) + "\n")

	                np.save(CHAINS_DIR + structure_name + "." + chain_id, res_pos)

	    print("Total chains:", total_chains)