Songyou committed on
Commit
14be268
·
verified ·
1 Parent(s): 513314a

Upload combine_mol.py

Browse files
Files changed (1) hide show
  1. combine_mol.py +146 -0
combine_mol.py ADDED
@@ -0,0 +1,146 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ''' Common import and functions '''
2
+ import pandas as pd
3
+ import numpy as np
4
+ import seaborn as sns
5
+ from matplotlib import pyplot as plt
6
+ import os,sys
7
+ import re
8
+ import sqlite3
9
+ from glob import glob
10
+ from pathlib import Path
11
+ import rdkit
12
+ from rdkit import Chem, DataStructs
13
+ from rdkit.Chem import Descriptors, rdMolDescriptors, AllChem, QED
14
+ from rdkit.Chem import ChemicalFeatures
15
+ from my_toolset.my_utils import get_mol,canonic_smiles
16
+ from rdkit import RDConfig
17
+ from functools import partial
18
+ import numpy as np
19
+ import re
20
+ import argparse
21
+ import copy
22
+
23
def get_dummy_negb(atom):
    ''' Return the index of the first neighbor of a dummy atom.

    Args:
        atom: object exposing ``GetNeighbors()``; each neighbor must expose
            ``GetIdx()`` (an RDKit atom in this file's usage).

    Returns:
        int: ``GetIdx()`` of the first entry of ``atom.GetNeighbors()``.

    Raises:
        IndexError: if the atom has no neighbors.
    '''
    neighbors = atom.GetNeighbors()
    # Only the first neighbor is consulted; callers use it as the atom the
    # dummy is attached to.
    first_neighbor = neighbors[0]
    neighbor_idx = first_neighbor.GetIdx()
    return int(neighbor_idx)
27
+
28
def bondLabel(smi):
    ''' Rewrite attachment-point labels from ``*:N`` to ``N*``.

    Every occurrence of a star followed by ``:`` and a number (for example
    the ``*:1`` inside ``[*:1]``) is replaced by the number followed by a
    star (``[1*]``).  The whole run of digits is captured, so multi-digit
    labels such as ``*:12`` become ``12*`` instead of being cut after the
    first digit.

    Args:
        smi (str): SMILES-like string, possibly with several fragments.

    Returns:
        str: the string with all labels rewritten; unchanged when there is
        no ``*:N`` label.
    '''
    # One pass with a back-reference replaces the find/loop/replace sequence
    # and cannot partially rewrite a longer label.
    return re.sub(r"\*:(\d+)", r"\1*", smi)
36
+
37
def connect_constVar(constSmi, varSmi, return_type='smiles'):
    ''' Connect variable fragment(s) to the constant part via dummy atoms.

    The two SMILES are joined with '.', passed through ``bondLabel`` and
    parsed with ``get_mol``.  Dummy atoms (atomic number 0) are grouped by
    their isotope value; whenever a group holds exactly two dummy atoms, a
    single bond is added between their neighbors and both dummies are
    removed.  Remaining dummy atoms are replaced by hydrogen, and the
    molecule is round-tripped through SMILES before hydrogens are removed.

    Args:
        constSmi (str): SMILES of the constant part.
        varSmi (str): SMILES of the variable part.
        return_type (str): 'smiles' (default) or 'mol'.

    Returns:
        The SMILES string from ``Chem.MolToSmiles`` for 'smiles', the RDKit
        molecule for 'mol', and ``None`` for any other ``return_type``.
    '''
    comboSmi = bondLabel(constSmi + '.' + varSmi)
    # NOTE(review): get_mol is imported from my_toolset; presumably it
    # returns an RDKit Mol (or None on a parse failure) -- confirm.
    combo_mol = get_mol(comboSmi)
    dummy_matches = combo_mol.GetSubstructMatches(Chem.MolFromSmarts('[#0]'))  # detect the dummy atoms

    # Group dummy atoms by isotope label.  A dict (instead of a fixed
    # five-slot list) accepts any label value, so labels >= 5 no longer
    # raise IndexError.
    dummy_info = {}
    for imatch in dummy_matches:
        atm_idx = imatch[0]
        atom = combo_mol.GetAtomWithIdx(atm_idx)
        isotope = atom.GetIsotope()
        dummy_info.setdefault(isotope, []).append((atm_idx, get_dummy_negb(atom)))

    edcombo = Chem.EditableMol(combo_mol)
    dummyAtoms = []
    # Ascending label order, matching the original slot-by-slot traversal.
    for isotope in sorted(dummy_info):
        group = dummy_info[isotope]
        if len(group) != 2:
            continue
        (first_dummy, first_negb), (second_dummy, second_negb) = group
        edcombo.AddBond(first_negb, second_negb, order=Chem.rdchem.BondType.SINGLE)
        dummyAtoms.extend((first_dummy, second_dummy))

    # Delete from the highest index downwards so that removing one atom
    # does not shift the indices of the atoms still to be removed.
    for idummy in sorted(dummyAtoms, reverse=True):
        edcombo.RemoveAtom(idummy)
    combo = edcombo.GetMol()

    # Replace the remaining (unpaired) dummy atoms with hydrogen.
    products = Chem.ReplaceSubstructs(combo, Chem.MolFromSmarts('[#0]'), Chem.MolFromSmarts('[#1]'), replaceAll=True)
    combo = products[0]
    # Round trip through SMILES, then strip the hydrogens introduced above.
    combo_smi = Chem.MolToSmiles(combo)
    combo = Chem.MolFromSmiles(combo_smi)
    combo = Chem.RemoveHs(combo)

    if return_type == 'mol':
        return combo
    if return_type == 'smiles':
        return Chem.MolToSmiles(combo)
    # Unknown return_type: keep the original behavior of returning None.
    return None
84
+
85
def connect_constVar_try(constSmi, varSmi, return_type='smiles'):
    ''' Best-effort wrapper around ``connect_constVar``.

    Args:
        constSmi: SMILES of the constant part.
        varSmi: SMILES of the variable part.
        return_type (str): forwarded to ``connect_constVar`` ('smiles' by
            default).  Previously this argument was accepted but ignored.

    Returns:
        Whatever ``connect_constVar`` returns, or the empty string ``''``
        when it raises.
    '''
    try:
        return connect_constVar(constSmi, varSmi, return_type=return_type)
    except Exception:
        # Deliberate best-effort: any failure yields the '' sentinel.
        # ``Exception`` (not a bare except) lets KeyboardInterrupt and
        # SystemExit propagate so a long batch run can still be stopped.
        return ''
91
+
92
+
93
def get_completeMol(rootFolder, overwrite=False, unique=False):
    ''' Build complete molecules from generated fragments and collect them.

    Reads ``{rootFolder}/generated_molecules.csv``, combines the
    ``constantSMILES`` column with every ``Predicted_smi_<i>`` column
    (i in 1..9999) as well as ``fromVarSMILES`` / ``toVarSMILES`` through
    ``connect_constVar_try``, and writes the table to
    ``{rootFolder}/generated_molecules_complete.csv``.  It then writes
    ``{rootFolder}/generated_collection.csv`` with one row per generated
    molecule (columns ``Source_Mol``, ``Gen_Mol``, ``Delta_pki``), sorted
    by ``Delta_pki`` in descending order.

    Args:
        rootFolder: folder holding the input CSV; outputs go there too.
        overwrite (bool): when False and the complete CSV already exists,
            print a message and return without doing anything.
        unique (bool): when True, keep only the first (highest
            ``Delta_pki``) row for each ``Gen_Mol``.

    Returns:
        None.
    '''
    complete_csv = f"{rootFolder}/generated_molecules_complete.csv"
    if not overwrite and Path(complete_csv).exists():
        print('COMBINE MOL EXIST, SKIP COMBINATION!')
        return

    dfGen = pd.read_csv(f"{rootFolder}/generated_molecules.csv")  # load generated compounds

    # Resolve the prediction columns once (ascending index order) instead
    # of re-probing all 9999 candidate names for every row.
    known_columns = set(dfGen.columns)
    pred_cols = [
        name
        for name in (f"Predicted_smi_{igen}" for igen in range(1, 10000))
        if name in known_columns
    ]

    # Plain comprehensions over zipped columns give the same values as a
    # row-wise apply and also work when the table has no rows.
    const_smiles = dfGen['constantSMILES']
    for col in pred_cols:
        dfGen[col] = [connect_constVar_try(c, v) for c, v in zip(const_smiles, dfGen[col])]
    dfGen["Source_Mol"] = [connect_constVar_try(c, v) for c, v in zip(const_smiles, dfGen['fromVarSMILES'])]
    dfGen["Target_Mol"] = [connect_constVar_try(c, v) for c, v in zip(const_smiles, dfGen['toVarSMILES'])]
    dfGen.to_csv(complete_csv, index=False)

    gen_list = []
    for _, irow in dfGen.iterrows():
        srcCPD = canonic_smiles(irow["Source_Mol"])
        # Mean of all unsigned numbers found in the Delta_Value text.
        # NOTE(review): the pattern ignores a leading '-', so negative
        # values lose their sign -- confirm against the Delta_Value format.
        delta_numbers = [float(tok) for tok in re.findall(r'(\d+(?:\.\d+)?)', irow["Delta_Value"])]
        Delta_pki = np.array(delta_numbers).mean()
        for col in pred_cols:
            smi = irow[col]
            # connect_constVar_try reports failures as '', so an isna()
            # check alone never filters them out; skip both forms.
            if pd.isna(smi) or smi == '':
                continue
            gen_list.append([srcCPD, canonic_smiles(smi), Delta_pki])

    dfRes = pd.DataFrame(gen_list, columns=["Source_Mol", "Gen_Mol", 'Delta_pki'])
    # A single stable sort: ties keep generation order, and the later
    # drop_duplicates keeps the highest-Delta_pki row of each molecule.
    dfRes = dfRes.sort_values(by="Delta_pki", ascending=False, kind="stable")
    if unique:
        dfRes = dfRes.drop_duplicates(subset=['Gen_Mol'])
        print('removing dulplicated........')
    print(f"Total {len(dfRes)} molecules have been generated.")
    dfRes.to_csv(f"{rootFolder}/generated_collection.csv", index=False)
132
+
133
+
134
+
135
def _str2bool(value):
    ''' Convert a command-line token such as "true"/"False"/"1" to a bool. '''
    if isinstance(value, bool):
        return value
    text = str(value).strip().lower()
    if text in {"true", "t", "yes", "y", "on", "1"}:
        return True
    # The empty string stays False, as bool('') was before.
    if text in {"false", "f", "no", "n", "off", "0", ""}:
        return False
    raise argparse.ArgumentTypeError(f"expected a boolean value, got {value!r}")


def get_parser(argv=None):
    ''' Parse the command-line options of this script.

    Args:
        argv (list[str] | None): argument list to parse; ``None`` (the
            default) makes argparse read ``sys.argv[1:]``.

    Returns:
        argparse.Namespace with ``rootFolder`` (str), ``overwrite`` (bool)
        and ``unique`` (bool).
    '''
    parser = argparse.ArgumentParser()
    parser.add_argument("--rootFolder", help="the root folder to save the generated SMILES", required=True, default='')
    # type=bool would turn any non-empty text (including "False") into
    # True.  _str2bool parses the text; nargs='?' with const=True also
    # allows the bare flag form (``--overwrite``).
    parser.add_argument('--overwrite', type=_str2bool, nargs='?', const=True, default=False,
                        help='whether overwrite exist file')
    parser.add_argument('--unique', type=_str2bool, nargs='?', const=True, default=False,
                        help='whether to drop duplicated generated molecules')
    args = parser.parse_args(argv)
    return args
142
+
143
if __name__ == '__main__':
    # Script entry point: parse the command line, then run the combination
    # step with the parsed options.
    cli_args = get_parser()
    get_completeMol(
        rootFolder=cli_args.rootFolder,
        overwrite=cli_args.overwrite,
        unique=cli_args.unique,
    )