Spaces:
Sleeping
Sleeping
File size: 5,874 Bytes
''' Common imports and functions '''
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
import os,sys
import re
import sqlite3
from glob import glob
from pathlib import Path
import rdkit
from rdkit import Chem, DataStructs
from rdkit.Chem import Descriptors, rdMolDescriptors, AllChem, QED
from rdkit.Chem import ChemicalFeatures
from my_toolset.my_utils import get_mol,canonic_smiles
from rdkit import RDConfig
from functools import partial
import numpy as np
import re
import argparse
import copy
def get_dummy_negb(atom):
    ''' Return the index of the first neighbor of a dummy atom.

    `atom` must provide `GetNeighbors()`; the first neighbor returned must
    provide `GetIdx()`. The index is converted to a plain `int`.
    Raises IndexError when the atom has no neighbors.
    '''
    neighbors = atom.GetNeighbors()
    first_neighbor = neighbors[0]
    neighbor_idx = first_neighbor.GetIdx()
    return int(neighbor_idx)
def bondLabel(smi):
    ''' Rewrite atom-map style dummy labels ("*:N") as isotope style ("N*").

    Every occurrence of "*:" followed by one or more digits is replaced by
    those digits followed by "*", e.g. "[*:1]CC" -> "[1*]CC". The whole
    number is captured, so multi-digit labels such as "[*:12]" become
    "[12*]" instead of being cut after the first digit.

    Returns the rewritten string; input without such labels is returned
    unchanged.
    '''
    # \g<1> keeps the captured digits unambiguous next to the literal "*".
    return re.sub(r"\*:(\d+)", r"\g<1>*", smi)
def connect_constVar(constSmi, varSmi, return_type='smiles'):
    ''' Connect R group fragment(s) to the constant core.

    The two SMILES are joined with "." and parsed as one molecule. Dummy
    atoms (atomic number 0) are grouped by their isotope label; whenever
    exactly two dummy atoms share a label, their neighbor atoms are joined
    by a single bond and both dummy atoms are removed. Any dummy atom left
    over is replaced by hydrogen, which is then stripped.

    Parameters:
        constSmi: SMILES of the constant part, with labelled dummy atoms.
        varSmi: SMILES of the variable part, with labelled dummy atoms.
        return_type: 'smiles' returns a SMILES string, 'mol' returns the
            RDKit molecule. Any other value returns None.

    Labels are no longer limited to 0-4: dummy atoms are grouped in a dict
    keyed by isotope, so any label value can be paired.
    '''
    # bondLabel rewrites "*:N" labels into the isotope form "N*".
    comboSmi = bondLabel(constSmi + '.' + varSmi)
    # NOTE(review): get_mol comes from my_toolset; presumably it returns an
    # RDKit Mol (or None for unparsable SMILES) -- confirm.
    combo_mol = get_mol(comboSmi)

    dummy_query = Chem.MolFromSmarts('[#0]')  # matches every dummy atom
    combo_atoms = combo_mol.GetAtoms()

    # isotope label -> list of (dummy atom idx, neighbor atom idx)
    dummy_info = {}
    for imatch in combo_mol.GetSubstructMatches(dummy_query):
        atm_idx = imatch[0]
        dummy_atom = combo_atoms[atm_idx]
        isotope = dummy_atom.GetIsotope()
        dummy_info.setdefault(isotope, []).append(
            (atm_idx, get_dummy_negb(dummy_atom)))

    edcombo = Chem.EditableMol(combo_mol)
    dummyAtoms = []
    # Process labels in ascending order, as the original slot list did.
    for isotope in sorted(dummy_info):
        labelled = dummy_info[isotope]
        if len(labelled) != 2:
            continue  # only an exact pair defines a connection
        (dummy_a, negb_a), (dummy_b, negb_b) = labelled
        edcombo.AddBond(negb_a, negb_b, order=Chem.rdchem.BondType.SINGLE)
        dummyAtoms.extend((dummy_a, dummy_b))

    # Remove from the highest index down so that removing one atom cannot
    # shift the index of another atom still waiting to be removed.
    for idummy in sorted(dummyAtoms, reverse=True):
        edcombo.RemoveAtom(idummy)
    combo = edcombo.GetMol()

    # Replace the remaining (unpaired) dummy atoms with hydrogen.
    products = Chem.ReplaceSubstructs(
        combo, dummy_query, Chem.MolFromSmarts('[#1]'), replaceAll=True)
    combo = products[0]
    # Round-trip through SMILES to rebuild the molecule, then drop the
    # hydrogens that replaced the dummy atoms.
    combo_smi = Chem.MolToSmiles(combo)
    combo = Chem.MolFromSmiles(combo_smi)
    combo = Chem.RemoveHs(combo)

    if return_type == 'mol':
        return combo
    if return_type == 'smiles':
        return Chem.MolToSmiles(combo)
def connect_constVar_try(constSmi, varSmi, return_type='smiles'):
    ''' Best-effort wrapper around connect_constVar.

    Forwards `return_type` to connect_constVar (it was previously ignored
    and 'smiles' was always used) and returns its result. If the
    connection fails with any ordinary exception, an empty string is
    returned instead of raising.
    '''
    try:
        return connect_constVar(constSmi, varSmi, return_type=return_type)
    except Exception:
        # Deliberately broad: any failure to build the molecule maps to ''.
        # Unlike a bare `except:`, this lets KeyboardInterrupt and
        # SystemExit propagate.
        return ''
def get_completeMol(rootFolder, overwrite=False, unique=False):
    ''' Assemble complete molecules from generated fragments and collect them.

    Reads "{rootFolder}/generated_molecules.csv", which must contain the
    columns constantSMILES, fromVarSMILES, toVarSMILES, Delta_Value and any
    number of Predicted_smi_<N> columns (N in 1..9999). Each variable
    fragment is connected to the constant part with connect_constVar_try.

    Files written:
        generated_molecules_complete.csv: the input table with the
            Predicted_smi_<N> columns replaced by complete molecules, plus
            Source_Mol and Target_Mol columns.
        generated_collection.csv: one row per generated molecule with the
            columns Source_Mol, Gen_Mol and Delta_pki, sorted by Delta_pki
            in descending order.

    Parameters:
        rootFolder: folder holding the input CSV and receiving the outputs.
        overwrite: when False and generated_molecules_complete.csv already
            exists, print a message and return without doing anything.
        unique: when True, keep only the first row for each Gen_Mol.
    '''
    if not overwrite and Path(f"{rootFolder}/generated_molecules_complete.csv").exists():
        print('COMBINE MOL EXIST, SKIP COMBINATION!')
        return
    dfGen = pd.read_csv(f"{rootFolder}/generated_molecules.csv")  # load generated compounds

    # Resolve the prediction columns once, instead of probing all 9999
    # candidate names again for every row of the table.
    available = set(dfGen.columns)
    pred_cols = [
        col for col in (f"Predicted_smi_{igen}" for igen in range(1, 10000))
        if col in available
    ]

    for col in pred_cols:
        dfGen[col] = dfGen.apply(
            lambda x: connect_constVar_try(x['constantSMILES'], x[col]), axis=1)
    dfGen["Source_Mol"] = dfGen.apply(
        lambda x: connect_constVar_try(x['constantSMILES'], x['fromVarSMILES']), axis=1)
    dfGen["Target_Mol"] = dfGen.apply(
        lambda x: connect_constVar_try(x['constantSMILES'], x['toVarSMILES']), axis=1)
    dfGen.to_csv(f"{rootFolder}/generated_molecules_complete.csv", index=None)

    # NOTE(review): the pattern matches unsigned numbers only, so a leading
    # "-" is not captured -- confirm Delta_Value never encodes negatives.
    number_pattern = re.compile(r'(\d+(?:\.\d+)?)')
    gen_list = []
    for idx, irow in dfGen.iterrows():
        srcCPD = canonic_smiles(irow["Source_Mol"])
        # Delta_pki is the mean of every number found in Delta_Value.
        Delta_pki = np.array(
            [float(i) for i in number_pattern.findall(irow["Delta_Value"])]).mean()
        for col in pred_cols:
            smi = irow[col]
            if pd.isna(smi):
                continue
            gen_list.append([srcCPD, canonic_smiles(smi), Delta_pki])

    dfRes = pd.DataFrame(gen_list, columns=["Source_Mol", "Gen_Mol", 'Delta_pki'])
    dfRes.sort_values(by="Delta_pki", ascending=False, inplace=True)
    if unique:
        # Sorted first, so the kept row per Gen_Mol is a top-Delta_pki one.
        dfRes.drop_duplicates(subset=['Gen_Mol'], inplace=True)
        print('removing dulplicated........')
    print(f"Total {len(dfRes)} molecules have been generated.")
    dfRes.sort_values(by="Delta_pki", ascending=False, inplace=True)
    dfRes.to_csv(f"{rootFolder}/generated_collection.csv", index=None)
def _str2bool(value):
    ''' Convert a command-line token such as "True", "false" or "0" to bool. '''
    if isinstance(value, bool):
        return value
    text = value.strip().lower()
    if text in ('true', 't', 'yes', 'y', '1'):
        return True
    # '' is kept as False because the former `type=bool` treated it so.
    if text in ('false', 'f', 'no', 'n', '0', ''):
        return False
    raise argparse.ArgumentTypeError(f"expected a boolean value, got {value!r}")


def get_parser():
    ''' Parse the command-line arguments of this script.

    Options:
        --rootFolder: required, the folder holding the generated SMILES.
        --overwrite: boolean, default False.
        --unique: boolean, default False.

    The boolean options accept an explicit value ("--overwrite True",
    "--overwrite False") or no value at all ("--overwrite", meaning True).
    `type=bool` was replaced because bool() of any non-empty string,
    including "False", is True.

    Returns the argparse.Namespace produced from sys.argv.
    '''
    parser = argparse.ArgumentParser()
    parser.add_argument("--rootFolder", help="the root folder to save the generated SMILES", required=True)
    parser.add_argument('--overwrite', type=_str2bool, nargs='?', const=True, default=False,
                        help='whether overwrite exist file')
    parser.add_argument('--unique', type=_str2bool, nargs='?', const=True, default=False,
                        help='whether to drop duplicated generated molecules')
    args = parser.parse_args()
    return args
if __name__ == '__main__':
    # Script entry point: read the command-line options, then build the
    # complete molecules for the requested root folder.
    cli_args = get_parser()
    get_completeMol(
        cli_args.rootFolder,
        overwrite=cli_args.overwrite,
        unique=cli_args.unique,
    )
|