Spaces:
Sleeping
Sleeping
| ''' Common import and functions ''' | |
| import pandas as pd | |
| import numpy as np | |
| import seaborn as sns | |
| from matplotlib import pyplot as plt | |
| import os,sys | |
| import re | |
| import sqlite3 | |
| from glob import glob | |
| from pathlib import Path | |
| import rdkit | |
| from rdkit import Chem, DataStructs | |
| from rdkit.Chem import Descriptors, rdMolDescriptors, AllChem, QED | |
| from rdkit.Chem import ChemicalFeatures | |
| from my_toolset.my_utils import get_mol,canonic_smiles | |
| from rdkit import RDConfig | |
| from functools import partial | |
| import numpy as np | |
| import re | |
| import argparse | |
| import copy | |
def get_dummy_negb(atom):
    ''' Return the index of the first neighbor of a dummy atom.

    Args:
        atom: an atom-like object exposing ``GetNeighbors()``; each neighbor
            must expose ``GetIdx()``.

    Returns:
        int: ``GetIdx()`` of the first entry returned by ``GetNeighbors()``.

    Only the first neighbor is inspected; an atom without neighbors makes
    the ``[0]`` lookup fail.
    '''
    neighbors = atom.GetNeighbors()
    first_neighbor = neighbors[0]
    return int(first_neighbor.GetIdx())
def bondLabel(smi):
    ''' Rewrite ``*:<n>`` attachment labels as ``<n>*`` labels.

    Every occurrence of ``*:`` followed by digits is replaced by those
    digits followed by ``*`` (for example ``[*:1]`` becomes ``[1*]``).

    Args:
        smi (str): SMILES text that may contain ``*:<n>`` labels.

    Returns:
        str: the text with every label rewritten; text without labels is
        returned unchanged.
    '''
    # Match the whole number, not just its first digit: with a single-digit
    # pattern "[*:12]" would be rewritten to the malformed "[1*2]".
    return re.sub(r"\*:(\d+)", r"\1*", smi)
def connect_constVar(constSmi, varSmi, return_type='smiles'):
    ''' Connect a variable fragment (R group) to a constant fragment (core).

    The two SMILES are concatenated as disconnected components, their
    ``*:<n>`` labels are rewritten by ``bondLabel`` and the result is parsed
    with ``get_mol``. Dummy atoms (atomic number 0) are grouped by isotope
    value. For every isotope value carried by exactly two dummy atoms, the
    first neighbors of those two atoms are joined by a single bond and both
    dummy atoms are removed. Remaining dummy atoms are replaced by hydrogen,
    and the molecule is round-tripped through SMILES before ``RemoveHs``.

    Args:
        constSmi (str): SMILES of the constant part.
        varSmi (str): SMILES of the variable part.
        return_type (str): ``'smiles'`` to return a SMILES string, ``'mol'``
            to return the molecule object.

    Returns:
        The joined molecule as SMILES or as a molecule object. Any other
        ``return_type`` yields ``None``.
    '''
    combo_mol = get_mol(bondLabel(constSmi + '.' + varSmi))
    dummy_query = Chem.MolFromSmarts('[#0]')  # matches every dummy atom

    # Group dummy atoms by isotope label. A dict is used instead of a
    # fixed-length list so that labels >= 5 no longer raise IndexError.
    dummies_by_isotope = {}
    for match in combo_mol.GetSubstructMatches(dummy_query):
        atom_idx = match[0]
        atom = combo_mol.GetAtomWithIdx(atom_idx)
        dummies_by_isotope.setdefault(atom.GetIsotope(), []).append(
            (atom_idx, get_dummy_negb(atom))
        )

    editable = Chem.EditableMol(combo_mol)
    paired_dummies = []
    # Ascending isotope order keeps the bond-creation order of the original
    # list-based implementation.
    for isotope in sorted(dummies_by_isotope):
        group = dummies_by_isotope[isotope]
        if len(group) != 2:
            # Only labels shared by exactly two dummy atoms define a joint.
            continue
        (first_idx, first_negb), (second_idx, second_negb) = group
        editable.AddBond(first_negb, second_negb,
                         order=Chem.rdchem.BondType.SINGLE)
        paired_dummies.extend((first_idx, second_idx))

    # Remove from the highest index down so that a removal cannot shift the
    # indices that are still waiting to be removed.
    for atom_idx in sorted(paired_dummies, reverse=True):
        editable.RemoveAtom(atom_idx)
    combo = editable.GetMol()

    # Replace every remaining (unpaired) dummy atom with hydrogen.
    products = Chem.ReplaceSubstructs(combo, dummy_query,
                                      Chem.MolFromSmarts('[#1]'),
                                      replaceAll=True)
    combo = products[0]
    # Round-trip through SMILES before stripping the hydrogens.
    combo = Chem.MolFromSmiles(Chem.MolToSmiles(combo))
    combo = Chem.RemoveHs(combo)

    if return_type == 'mol':
        return combo
    if return_type == 'smiles':
        return Chem.MolToSmiles(combo)
    return None
def connect_constVar_try(constSmi, varSmi, return_type='smiles'):
    ''' Best-effort wrapper around ``connect_constVar``.

    Args:
        constSmi: SMILES of the constant part.
        varSmi: SMILES of the variable part.
        return_type: forwarded to ``connect_constVar`` (it was previously
            ignored and ``'smiles'`` was always used).

    Returns:
        Whatever ``connect_constVar`` returns, or the empty string ``''``
        when it raises.
    '''
    try:
        return connect_constVar(constSmi, varSmi, return_type=return_type)
    except Exception:
        # Deliberate best-effort: a fragment pair that cannot be joined gives
        # an empty result. ``Exception`` (not a bare ``except``) lets
        # KeyboardInterrupt and SystemExit propagate.
        return ''
def get_completeMol(rootFolder, overwrite=False, unique=False):
    ''' Join generated fragments with their constant part and collect results.

    Reads ``{rootFolder}/generated_molecules.csv``. The code accesses the
    columns ``constantSMILES``, ``fromVarSMILES``, ``toVarSMILES``,
    ``Delta_Value`` and any ``Predicted_smi_<n>`` with ``n`` in 1..9999.

    Two files are written into ``rootFolder``:

    * ``generated_molecules_complete.csv``: the input table with every
      ``Predicted_smi_<n>`` column replaced by the joined molecule, plus
      ``Source_Mol`` and ``Target_Mol`` columns.
    * ``generated_collection.csv``: one row per generated molecule with the
      columns ``Source_Mol``, ``Gen_Mol`` and ``Delta_pki``, sorted by
      ``Delta_pki`` in descending order.

    Args:
        rootFolder (str): folder holding the input CSV and receiving the outputs.
        overwrite (bool): when false and the "complete" CSV already exists,
            nothing is done.
        unique (bool): when true, keep only the first row per ``Gen_Mol``
            after sorting.

    Returns:
        None
    '''
    if not overwrite and Path(f"{rootFolder}/generated_molecules_complete.csv").exists():
        print('COMBINE MOL EXIST, SKIP COMBINATION!')
        return

    dfGen = pd.read_csv(f"{rootFolder}/generated_molecules.csv")  # generated compounds

    # Resolve the prediction columns once, instead of re-scanning 9999
    # candidate names for every row of the table.
    known_columns = set(dfGen.columns)
    pred_cols = [
        f"Predicted_smi_{igen}"
        for igen in range(1, 10000)
        if f"Predicted_smi_{igen}" in known_columns
    ]

    def _join_column(var_col):
        # Attach each variable fragment of ``var_col`` to the row's constant part.
        return [
            connect_constVar_try(const_smi, var_smi)
            for const_smi, var_smi in zip(dfGen['constantSMILES'], dfGen[var_col])
        ]

    for col in pred_cols:
        dfGen[col] = _join_column(col)
    dfGen["Source_Mol"] = _join_column('fromVarSMILES')
    dfGen["Target_Mol"] = _join_column('toVarSMILES')
    dfGen.to_csv(f"{rootFolder}/generated_molecules_complete.csv", index=False)

    # NOTE(review): the pattern captures unsigned numbers only, so a leading
    # '-' in Delta_Value is ignored. Confirm this matches the column format.
    number_re = re.compile(r'(\d+(?:\.\d+)?)')
    gen_list = []
    for _, irow in dfGen.iterrows():
        srcCPD = canonic_smiles(irow["Source_Mol"])
        delta_values = [float(tok) for tok in number_re.findall(irow["Delta_Value"])]
        delta_pki = np.array(delta_values).mean()
        for col in pred_cols:
            smi = irow[col]
            # NOTE(review): failed joins are stored as '' (not NaN), so this
            # check does not filter them out. Confirm whether it should.
            if pd.isna(smi):
                continue
            gen_list.append([srcCPD, canonic_smiles(smi), delta_pki])

    dfRes = pd.DataFrame(gen_list, columns=["Source_Mol", "Gen_Mol", 'Delta_pki'])
    # A stable sort keeps generation order among equal Delta_pki values, which
    # makes the de-duplication below deterministic.
    dfRes.sort_values(by="Delta_pki", ascending=False, inplace=True, kind="mergesort")
    if unique:
        # drop_duplicates keeps the first (highest Delta_pki) occurrence and
        # preserves row order, so no second sort is needed.
        dfRes.drop_duplicates(subset=['Gen_Mol'], inplace=True)
        print('removing dulplicated........')
    print(f"Total {len(dfRes)} molecules have been generated.")
    dfRes.to_csv(f"{rootFolder}/generated_collection.csv", index=False)
def _str2bool(value):
    ''' Convert a command-line token such as "true"/"False"/"1"/"0" to a bool. '''
    if isinstance(value, bool):
        return value
    text = value.strip().lower()
    if text in ('true', 't', 'yes', 'y', '1'):
        return True
    if text in ('false', 'f', 'no', 'n', '0', ''):
        return False
    raise argparse.ArgumentTypeError(f"expected a boolean value, got {value!r}")


def get_parser(argv=None):
    ''' Parse the command-line options of this script.

    Args:
        argv (list[str] | None): argument list to parse; ``None`` (the
            default) makes argparse read ``sys.argv[1:]``.

    Returns:
        argparse.Namespace with the attributes ``rootFolder`` (str),
        ``overwrite`` (bool) and ``unique`` (bool).

    ``--overwrite`` and ``--unique`` accept an explicit value
    (``--overwrite False``) or can be given as a bare flag (``--overwrite``),
    which means true.
    '''
    parser = argparse.ArgumentParser()
    parser.add_argument("--rootFolder", required=True,
                        help="the root folder to save the generated SMILES")
    # ``type=bool`` would turn any non-empty string, including "False", into
    # True; parse the text explicitly instead.
    parser.add_argument('--overwrite', type=_str2bool, nargs='?', const=True,
                        default=False, help='whether overwrite exist file')
    parser.add_argument('--unique', type=_str2bool, nargs='?', const=True,
                        default=False,
                        help='whether to remove duplicated generated molecules')
    return parser.parse_args(argv)
if __name__ == '__main__':
    # Script entry point: parse the command line, then build the complete
    # molecules and the result collection in the requested folder.
    cli_args = get_parser()
    get_completeMol(
        rootFolder=cli_args.rootFolder,
        overwrite=cli_args.overwrite,
        unique=cli_args.unique,
    )