# File size: 5,874 Bytes
# 14be268
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
''' Common import and functions  '''
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
import os,sys
import re
import sqlite3
from glob import glob
from pathlib import Path
import rdkit
from rdkit import Chem, DataStructs
from rdkit.Chem import Descriptors, rdMolDescriptors, AllChem, QED
from rdkit.Chem import ChemicalFeatures
from my_toolset.my_utils import get_mol,canonic_smiles
from rdkit import RDConfig
from functools import partial
import numpy as np
import re
import argparse
import copy

def get_dummy_negb(atom):
    ''' Return the index of the first neighbor of the given (dummy) atom.

    Only the first entry of ``atom.GetNeighbors()`` is used; an atom without
    neighbors raises IndexError.
    '''
    neighbors = atom.GetNeighbors()
    first_neighbor_idx = neighbors[0].GetIdx()
    return int(first_neighbor_idx)

def bondLabel(smi):
    ''' Rewrite every ``*:N`` label in a SMILES string as ``N*``.

    The number that follows ``*:`` is moved in front of the ``*``
    (e.g. ``[*:1]`` becomes ``[1*]``).  The whole run of digits is moved, so
    labels with more than one digit (``[*:12]`` -> ``[12*]``) stay intact.

    Args:
        smi: SMILES string, possibly containing ``*:N`` labels.

    Returns:
        The string with all ``*:N`` occurrences rewritten; unchanged when
        there is nothing to rewrite.
    '''
    # A single substitution pass avoids the partial-match problem of replacing
    # "*:1" inside "*:12".
    return re.sub(r"\*:(\d+)", r"\1*", smi)

def connect_constVar(constSmi, varSmi, return_type='smiles'):
    ''' Connect single R group to the core

    The two SMILES are joined into one disconnected molecule.  Dummy atoms
    (atomic number 0) are grouped by their isotope value; every isotope value
    carried by exactly two dummy atoms is resolved by adding a single bond
    between the two dummy atoms' neighbors and deleting both dummy atoms.
    Dummy atoms that remain are replaced by hydrogen, which is then removed.

    Args:
        constSmi: SMILES of the constant part (the core).
        varSmi: SMILES of the variable part (the R group).
        return_type: 'smiles' to get a SMILES string, 'mol' to get the RDKit
            molecule.  Any other value makes the function return None.

    Returns:
        The connected molecule as SMILES or RDKit mol, depending on
        ``return_type``.
    '''
    comboSmi = bondLabel(constSmi + '.' + varSmi)
    # get_mol is imported from my_toolset; presumably it parses the SMILES
    # into an RDKit molecule -- confirm against that package.
    combo_mol = get_mol(comboSmi)
    dummy_matches = combo_mol.GetSubstructMatches(Chem.MolFromSmarts('[#0]'))  # detect the dummy atoms
    combo_atoms = combo_mol.GetAtoms()

    # Group dummy atoms by isotope label.  A dict (instead of a fixed-size
    # list) accepts any label value, not only 0-4.
    dummy_info = {}
    for imatch in dummy_matches:
        atm_idx = imatch[0]
        dummy_atom = combo_atoms[atm_idx]
        isotope = dummy_atom.GetIsotope()
        dummy_negb = get_dummy_negb(dummy_atom)
        dummy_info.setdefault(isotope, []).append([atm_idx, isotope, dummy_negb])

    edcombo = Chem.EditableMol(combo_mol)
    dummyAtoms = []
    # Ascending label order keeps the bond-creation order of the original
    # list-based implementation.
    for isotope in sorted(dummy_info):
        idummyPair = dummy_info[isotope]
        # Only labels that occur on exactly two dummy atoms define a connection.
        if len(idummyPair) == 2:
            edcombo.AddBond(idummyPair[0][2], idummyPair[1][2], order=Chem.rdchem.BondType.SINGLE)
            dummyAtoms.append(idummyPair[0][0])
            dummyAtoms.append(idummyPair[1][0])
    # Remove from the highest index down so earlier removals do not shift the
    # indices of atoms that are still to be removed.
    dummyAtoms.sort(reverse=True)
    for idummy in dummyAtoms:
        edcombo.RemoveAtom(idummy)
    combo = edcombo.GetMol()

    # Replace the remaining (unpaired) dummy atoms with hydrogen.
    products = Chem.ReplaceSubstructs(combo, Chem.MolFromSmarts('[#0]'), Chem.MolFromSmarts('[#1]'), replaceAll=True)
    combo = products[0]
    # Round-trip through SMILES, then strip the hydrogens.
    combo_smi = Chem.MolToSmiles(combo)
    combo = Chem.MolFromSmiles(combo_smi)
    combo = Chem.RemoveHs(combo)
    if return_type == 'mol':
        return combo
    if return_type == 'smiles':
        return Chem.MolToSmiles(combo)

def connect_constVar_try(constSmi, varSmi, return_type='smiles'):
    ''' Best-effort wrapper around connect_constVar.

    Args:
        constSmi: SMILES of the constant part.
        varSmi: SMILES of the variable part.
        return_type: forwarded to connect_constVar ('smiles' or 'mol').

    Returns:
        Whatever connect_constVar returns, or an empty string when it raises.
    '''
    try:
        # Forward return_type instead of forcing 'smiles', so the parameter
        # actually has an effect.
        return connect_constVar(constSmi, varSmi, return_type=return_type)
    except Exception:
        # Failures are deliberately swallowed (callers get ''), but
        # KeyboardInterrupt/SystemExit are no longer caught.
        return ''
    

def get_completeMol(rootFolder, overwrite=False, unique=False):
    ''' Build complete molecules from generated fragments and collect them.

    Reads ``{rootFolder}/generated_molecules.csv``, connects the
    ``constantSMILES`` column with every ``Predicted_smi_N`` column (N in
    1..9999) as well as with ``fromVarSMILES`` / ``toVarSMILES``, and writes
    the result to ``{rootFolder}/generated_molecules_complete.csv``.  It then
    writes one row per generated molecule (source molecule, generated
    molecule, mean of the numbers found in ``Delta_Value``) to
    ``{rootFolder}/generated_collection.csv``, sorted by ``Delta_pki``
    descending.

    Args:
        rootFolder: folder containing ``generated_molecules.csv``; outputs are
            written to the same folder.
        overwrite: when False and the "complete" CSV already exists, nothing
            is done.
        unique: when True, rows with a duplicated ``Gen_Mol`` are dropped
            (the first occurrence after sorting is kept).

    Returns:
        None.
    '''
    if not overwrite and Path(f"{rootFolder}/generated_molecules_complete.csv").exists():
        print('COMBINE MOL EXIST, SKIP COMBINATION!')
        return
    dfGen = pd.read_csv(f"{rootFolder}/generated_molecules.csv")  # load generated compounds

    # Determine the prediction columns once instead of re-scanning 1..9999
    # for every row further below.
    existing_cols = set(dfGen.columns)
    pred_cols = [
        f"Predicted_smi_{igen}"
        for igen in range(1, 10000)
        if f"Predicted_smi_{igen}" in existing_cols
    ]

    for col in pred_cols:
        dfGen[col] = dfGen.apply(
            lambda x, col=col: connect_constVar_try(x['constantSMILES'], x[col]), axis=1)
    dfGen["Source_Mol"] = dfGen.apply(
        lambda x: connect_constVar_try(x['constantSMILES'], x['fromVarSMILES']), axis=1)
    dfGen["Target_Mol"] = dfGen.apply(
        lambda x: connect_constVar_try(x['constantSMILES'], x['toVarSMILES']), axis=1)
    dfGen.to_csv(f"{rootFolder}/generated_molecules_complete.csv", index=None)

    number_pattern = re.compile(r'(\d+(?:\.\d+)?)')
    gen_list = []
    for idx, irow in dfGen.iterrows():
        srcCPD = canonic_smiles(irow["Source_Mol"])
        # NOTE(review): the pattern only captures unsigned numbers, so a
        # leading minus sign in Delta_Value is ignored -- confirm that this
        # matches the Delta_Value format.
        # str() lets numeric cells through instead of raising TypeError.
        Delta_pki = [float(i) for i in number_pattern.findall(str(irow["Delta_Value"]))]
        Delta_pki = np.array(Delta_pki).mean()
        for col in pred_cols:
            smi = irow[col]
            # connect_constVar_try returns '' on failure, which pd.isna does
            # not detect; skip those so failed combinations are not collected.
            if pd.isna(smi) or smi == '':
                continue
            gen_list.append([srcCPD, canonic_smiles(smi), Delta_pki])

    dfRes = pd.DataFrame(gen_list, columns=["Source_Mol", "Gen_Mol", 'Delta_pki'])
    # Sort before de-duplicating so the highest Delta_pki row of each
    # duplicated Gen_Mol is the one that is kept.
    dfRes.sort_values(by="Delta_pki", ascending=False, inplace=True)
    if unique:
        dfRes.drop_duplicates(subset=['Gen_Mol'], inplace=True)
        print('removing dulplicated........')
    print(f"Total {len(dfRes)} molecules have been generated.")
    dfRes.sort_values(by="Delta_pki", ascending=False, inplace=True)
    dfRes.to_csv(f"{rootFolder}/generated_collection.csv", index=None)
    


def _str2bool(value):
    ''' Convert a command-line string such as "True"/"false"/"1"/"no" to bool. '''
    if isinstance(value, bool):
        return value
    text = value.strip().lower()
    if text in ('true', 't', 'yes', 'y', '1'):
        return True
    # The empty string is kept falsy, as it was with ``type=bool``.
    if text in ('false', 'f', 'no', 'n', '0', ''):
        return False
    raise argparse.ArgumentTypeError(f"expected a boolean value, got {value!r}")


def get_parser():
    ''' Parse the command-line arguments of this script.

    Options:
        --rootFolder  (required) the root folder to save the generated SMILES.
        --overwrite   boolean, default False; accepts ``--overwrite``,
                      ``--overwrite True`` or ``--overwrite False``.
        --unique      boolean, default False; same accepted forms.

    Returns:
        The argparse namespace with ``rootFolder``, ``overwrite`` and
        ``unique`` attributes.
    '''
    parser = argparse.ArgumentParser()
    parser.add_argument("--rootFolder", help="the root folder to save the generated SMILES", required=True)
    # ``type=bool`` would turn every non-empty string (including "False") into
    # True, so an explicit converter is used.  nargs='?' with const=True also
    # allows the bare flag form.
    parser.add_argument('--overwrite', type=_str2bool, nargs='?', const=True, default=False,
                        help='whether overwrite exist file')
    parser.add_argument('--unique', type=_str2bool, nargs='?', const=True, default=False,
                        help='whether remove duplicated generated molecules')
    args = parser.parse_args()
    return args

if __name__ == '__main__':
    # Script entry point: read the CLI options and run the combination step.
    args = get_parser()
    get_completeMol(
        rootFolder=args.rootFolder,
        overwrite=args.overwrite,
        unique=args.unique,
    )