File size: 4,274 Bytes
b140e2c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3068eb3
b140e2c
 
 
 
 
 
 
3068eb3
b140e2c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3068eb3
b140e2c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
# author: Jan Velecky, adapted from Pedro
import os
import numpy as np
import gzip
import pickle

# Project directory layout, anchored two levels above this file.
# The file location is made absolute first: with a bare relative __file__
# (e.g. `python script.py` on interpreters that do not absolutize it),
# dirname() returns '' and '/../../' would point at the filesystem root.
ROOT_DIR    = os.path.dirname(os.path.abspath(__file__)) + '/../../'
RAW_DIR     = ROOT_DIR + '/data_raw/pdbs/'                # original PDB files
OPTIMS_DIR  = ROOT_DIR + '/data_temp/optimized-pdbs/'     # optimized structures (preferred over raw ones)
MUTANTS_DIR = ROOT_DIR + '/data_temp/mutated-pdbs/'       # mutated structures

# Output locations written by the preprocessing script.
PREPRO_DIR  = ROOT_DIR + '/data_preprocessed/'
CHAINS_DIR  = PREPRO_DIR + '/chains/'                     # per-chain coordinate arrays

# should not contain multi-chains
# should deal with partial residues
# mutated residues should be in the structure

import warnings
# Silence all warnings for the whole process. The filter is installed before
# the Bio imports below, so it is already active while they load and later
# while structures are parsed.
warnings.filterwarnings("ignore")

from Bio.PDB.PDBParser import PDBParser
# NOTE(review): three_to_one has been deprecated/removed in newer Biopython
# releases - confirm the Biopython version this project is pinned to.
from Bio.PDB.Polypeptide import is_aa, three_to_one

# Single module-level parser instance shared by every call that reads a PDB file.
# PERMISSIVE=1 presumably makes the parser tolerate malformed records instead of
# raising - see the Biopython PDBParser documentation to confirm.
parser = PDBParser(PERMISSIVE=1)

class Type_Seq(dict):
	"""Mapping of {position: amino acid} that also behaves a bit like a sequence string.

	Differences from a plain dict:
	  * str() joins the values, in insertion order, into one string;
	  * iteration walks the values (amino acids) rather than the keys;
	  * copy() returns a Type_Seq instead of a plain dict.

	Key lookup, len(), `in` and the keys()/values()/items() views are the
	ordinary dict ones. Slicing is deliberately not implemented: it is unclear
	whether a slice should address positions (keys) or offsets in the sequence.
	"""

	def __init__(self, *args, **kwargs):
		# Accepts exactly what dict() accepts.
		super().__init__(*args, **kwargs)

	def __str__(self):
		# Directly convertible to the amino-acid sequence string.
		residues = self.values()
		return ''.join(residues)

	def __repr__(self):
		# Shown exactly like the equivalent plain dict.
		return repr(dict(self.items()))

	def __iter__(self):
		# The dict default iterates over the keys; a sequence iterates over its residues.
		return iter(self.values())

	def copy(self):
		# dict.copy() would hand back a plain dict, so rebuild a Type_Seq explicitly.
		return Type_Seq(self.items())


def _process_pdb(filepath, gap_detect=True):
	"""Parse a PDB file and yield the sequence and C-alpha coordinates of each chain.

	Only the first model of the structure (`structure[0]`) is read. Within a
	chain, a residue is kept when `is_aa` accepts its residue name and it has
	a 'CA' atom; everything else is skipped.

	Args:
		filepath: path of the PDB file to read.
		gap_detect: when true, print a notice whenever the sequence numbers of
			two consecutive kept residues differ by more than 8.

	Yields:
		(chain_id, seq, coords) for every chain of the first model, where
		`seq` is a Type_Seq mapping the position (sequence number plus
		insertion code, as a string) to the one-letter amino-acid code
		('X' when `three_to_one` does not know the residue name), and `coords`
		is a NumPy array built from the 'CA' coordinates of the kept residues
		(empty when no residue was kept).
	"""
	# Structure id = file name without directory and extension.
	structure_name = os.path.splitext(os.path.basename(filepath))[0]

	# get_structure() consumes the whole handle, so the file is closed right
	# after parsing instead of staying open while the generator is suspended
	# (or abandoned half-way by a caller that only wants one chain).
	with open(filepath, 'rt') as ifh:
		structure = parser.get_structure(structure_name, ifh)

	for chain in structure[0]:
		res_aas = {} # sequence from the pdb
		res_pos = [] # positions of the C-alpha atoms
		prev_seqid = None
		for residue in chain:
			if not is_aa(residue.get_resname()) or 'CA' not in residue:
				continue

			_, seqid, ins_code = residue.id
			if gap_detect:
				# Compare against None explicitly: residue number 0 is a valid
				# predecessor and must not be mistaken for "no previous residue".
				if prev_seqid is not None:
					gap = seqid - prev_seqid
					if gap > 8:
						print("Gap of %i AAs detected at %s" % (gap, str(residue.full_id)))
				prev_seqid = seqid

			position = (str(seqid) + ins_code).strip() # remove ' ' for an empty insertion code
			try: # std amino acids
				res_aas[position] = three_to_one(residue.get_resname())
			except KeyError: # name unknown to three_to_one
				res_aas[position] = 'X'
			# NOTE(review): two kept residues sharing the same number and insertion
			# code would overwrite each other in res_aas but both add a coordinate
			# row - confirm the inputs never contain such duplicates.
			res_pos.append(residue['CA'].get_coord())

		yield chain.id, Type_Seq(res_aas), np.array(res_pos)


def process_pdb(filepath, chain: str = None):
	"""Read a PDB file and return either all of its chains or a single one.

	Args:
		filepath: path of the PDB file, passed on to `_process_pdb`.
		chain: id of the wanted chain; when empty or None all chains are returned.

	Returns:
		Without `chain`: the generator produced by `_process_pdb`, yielding
		(chain_id, seq, coords) for each chain.
		With `chain`: the (seq, coords) pair of the first chain whose id equals
		`chain`, or None when the file has no such chain.
	"""
	all_chains = _process_pdb(filepath)

	if chain:
		# Scan lazily and stop at the first chain with the requested id.
		for record in all_chains:
			if record[0] == chain:
				return record[1:]
		return None

	# No chain requested: hand the generator over all chains to the caller.
	return all_chains


def mutate_seq(res_aas: Type_Seq, wildtype: list, location: list, mutation: list) -> Type_Seq:
	"""Return a copy of `res_aas` with the given point mutations applied.

	The three lists are walked in lockstep (zip stops at the shortest one):
	for each triple, the residue stored at `location` must equal `wildtype`
	and is then replaced by `mutation`. The input sequence is left untouched.

	Args:
		res_aas: sequence to mutate, indexed by position.
		wildtype: expected current residue for each mutation.
		location: position key of each mutation, as used in `res_aas`.
		mutation: replacement residue for each mutation.

	Returns:
		The mutated copy, as produced by `res_aas.copy()`.

	Raises:
		ValueError: when the residue found at a location differs from the
			stated wildtype.
		KeyError: when a location is not a key of `res_aas`.
	"""
	mutated = res_aas.copy()

	for expected, position, replacement in zip(wildtype, location, mutation):
		present = mutated[position]
		if present != expected:
			label = ''.join((expected, str(position)))
			raise ValueError("Wildtype residue mismatch: %s is actually %s" % (label, present))
		mutated[position] = replacement

	return mutated


if __name__ == "__main__":
	# Preprocess every known PDB file: write one FASTA record per chain to
	# PREPRO_DIR/chain_list_pdb.fasta and save the chain's C-alpha coordinates
	# as a NumPy array in CHAINS_DIR.

	list_pdbs = []
	set_pdbs = set()
	# Collect the files directory by directory. A file name already seen in an
	# earlier directory is skipped, so an optimized structure takes precedence
	# and the RAW structure is used only if no optimized one is found.
	for pdb_path in [OPTIMS_DIR, RAW_DIR, MUTANTS_DIR]:
		pdbs = {f for f in os.listdir(pdb_path) if os.path.isfile(os.path.join(pdb_path, f))}
		pdbs -= set_pdbs
		set_pdbs |= pdbs # merge sets
		# sorted() keeps the processing order, and hence the FASTA output, reproducible
		list_pdbs += [os.path.join(pdb_path, f) for f in sorted(pdbs)]

	total_chains = 0
	os.makedirs(CHAINS_DIR, exist_ok=True) # folders for preprocessed data

	with open(PREPRO_DIR+"chain_list_pdb.fasta", 'w') as chain_list_file:
		for cur_iter, cur_pdb in enumerate(list_pdbs):
			# Paths were built with os.path.join, so take the file name with
			# os.path as well instead of splitting on '/'.
			structure_name = os.path.splitext(os.path.basename(cur_pdb))[0].lower()

			if cur_iter % 100 == 0: # progress report
				print("%i/%i %s" % (cur_iter, len(list_pdbs), structure_name))

			for chain_id, seq, res_pos in process_pdb(cur_pdb): # todo: _process_pdb(cur_pdb, gap_detect=False)
				if len(seq) <= 10: # chains of 10 residues or fewer are skipped
					continue
				total_chains += 1

				chain_list_file.write(">"+structure_name+"."+chain_id+"\n")
				# seq is a Type_Seq (a dict): it must be converted explicitly,
				# `seq + "\n"` raises TypeError.
				chain_list_file.write(str(seq)+"\n")

				np.save(CHAINS_DIR+structure_name+"."+chain_id, res_pos)

	print("Total chains:", total_chains)