File size: 1,528 Bytes
dcacefd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
import sys
import os
sys.path.append("..")
import rdkit
import rdkit.Chem as Chem
import copy
import pickle
from tqdm.auto import tqdm
from protein_ligand import PDBProtein, parse_sdf_file


# Expected number of atoms for each residue type. The list is indexed by the
# integer that PDBProtein.AA_NAME_NUMBER assigns to a residue name; index 0
# holds 0 (presumably a placeholder slot -- confirm against AA_NAME_NUMBER).
NUM_ATOMS = [0, 5, 11, 8, 8, 6, 9, 9, 4, 10, 8, 8, 9, 8, 11, 7, 6, 7, 14, 12, 7]

# Default dataset locations used when the script is run without overrides.
INDEX_PATH = '/data/zaixi/Pocket_Design/data/crossdocked_pocket10/index.pkl'
RAW_PATH = '/data/zaixi/Pocket_Design/data/crossdocked_pocket10'


def safe_average(total, count):
    """Return ``total / count``, or NaN when ``count`` is zero.

    Used for the summary so that a run in which every entry was skipped
    reports NaN instead of dying with ZeroDivisionError.
    """
    if not count:
        return float('nan')
    return total / count


def collect_pocket_stats(index_path=INDEX_PATH, raw_path=RAW_PATH,
                         start=20000, stop=21000):
    """Accumulate residue statistics over a slice of the pocket index.

    Args:
        index_path: Path to the pickled index. Each entry is unpacked as a
            4-tuple whose first two items are the pocket and ligand file
            names, relative to ``raw_path``.
        raw_path: Directory that the file names in the index are joined to.
        start: First index entry to process (inclusive).
        stop: Last index entry to process (exclusive).

    Returns:
        A tuple ``(cnt, num_res, edit, skipped)``:
        ``cnt`` is the number of entries processed successfully,
        ``num_res`` the summed length of ``residue_dict['amino_acid']``,
        ``edit`` the summed ``mask.sum()`` of the residue/ligand query, and
        ``skipped`` the number of entries dropped because loading or
        validation raised an exception.
    """
    with open(index_path, 'rb') as f:
        # NOTE: unpickling can execute arbitrary code; only point this at an
        # index file from a trusted source.
        index = pickle.load(f)

    cnt = 0
    num_res = 0
    edit = 0
    skipped = 0
    for pocket_fn, ligand_fn, _, _ in tqdm(index[start:stop]):
        if pocket_fn is None:
            continue
        try:
            pdb_data = PDBProtein(os.path.join(raw_path, pocket_fn))
            # Result is unused; the call is kept because an error raised here
            # causes the entry to be skipped, exactly as before.
            pdb_data.to_dict_atom()
            residue_dict = pdb_data.to_dict_residue()
            ligand_dict = parse_sdf_file(os.path.join(raw_path, ligand_fn))
            mask = pdb_data.query_residues_ligand(ligand_dict)
            for k, residue in enumerate(pdb_data.residues):
                if not mask[k]:
                    continue
                expected = NUM_ATOMS[pdb_data.AA_NAME_NUMBER[residue['name']]]
                # Explicit check rather than `assert`, so the filter still
                # applies when Python runs with -O.
                if len(residue['atoms']) != expected:
                    raise ValueError(
                        'residue %s has %d atoms, expected %d'
                        % (residue['name'], len(residue['atoms']), expected)
                    )
            n_editable = mask.sum()
            n_residues = len(residue_dict['amino_acid'])
        except Exception:
            # Best-effort scan: unreadable or inconsistent entries are dropped.
            # KeyboardInterrupt / SystemExit are deliberately not caught.
            skipped += 1
            continue
        # Commit all counters together so a failure part-way through an entry
        # can never leave the totals out of step with `cnt`.
        edit += n_editable
        num_res += n_residues
        cnt += 1
    return cnt, num_res, edit, skipped


if __name__ == "__main__":
    cnt, num_res, edit, skipped = collect_pocket_stats()

    # Summary: processed entries and per-entry averages.
    print('Total number of molecules', cnt)
    print('average residues:', safe_average(num_res, cnt))
    print('average editable residues:', safe_average(edit, cnt))
    if skipped:
        # Reported on stderr so the stdout summary keeps its original format.
        print('skipped entries:', skipped, file=sys.stderr)