Spaces:

Songyou
/

LLM-fastAPI

Sleeping

App Files Files Community

Songyou committed on Jan 12, 2025

Commit

14be268

verified ·

1 Parent(s): 513314a

Upload combine_mol.py

Browse files

Files changed (1) hide show

combine_mol.py +146 -0

combine_mol.py ADDED Viewed

	@@ -0,0 +1,146 @@

+''' Common import and functions  '''
+import pandas as pd
+import numpy as np
+import seaborn as sns
+from matplotlib import pyplot as plt
+import os,sys
+import re
+import sqlite3
+from glob import glob
+from pathlib import Path
+import rdkit
+from rdkit import Chem, DataStructs
+from rdkit.Chem import Descriptors, rdMolDescriptors, AllChem, QED
+from rdkit.Chem import ChemicalFeatures
+from my_toolset.my_utils import get_mol,canonic_smiles
+from rdkit import RDConfig
+from functools import partial
+import numpy as np
+import re
+import argparse
+import copy
+def get_dummy_negb(atom):
+    ''' Return the index of the first neighbor of a dummy atom.
+
+    atom: an object exposing GetNeighbors(); the first entry of that
+          sequence must expose GetIdx().
+    Returns the neighbor index converted to a plain int.
+    Raises IndexError when the atom has no neighbors.
+    '''
+    neighbors=atom.GetNeighbors()
+    first_neighbor=neighbors[0]
+    neighbor_idx=first_neighbor.GetIdx()
+    return int(neighbor_idx)
+def bondLabel(smi):
+    ''' Rewrite dummy-atom labels from the "*:N" form to the "N*" form.
+
+    Every occurrence of "*:" followed by a number (e.g. "[*:1]") is replaced
+    by that number followed by "*" (e.g. "[1*]"), so the label ends up where
+    connect_constVar reads it back through GetIsotope().
+
+    smi: SMILES string, possibly containing several "*:N" labels.
+    Returns the rewritten string; input without such labels is returned unchanged.
+    '''
+    # Match the whole number: a single-digit pattern would turn "*:12" into
+    # "1*2", which is not the label the caller asked for.
+    return re.sub(r"\*:(\d+)", r"\1*", smi)
+def connect_constVar(constSmi, varSmi, return_type='smiles'):
+    ''' Connect R group(s) to the core by joining dummy atoms with the same label.
+
+    The two SMILES are concatenated with '.', their "*:N" labels are rewritten
+    by bondLabel, and the result is parsed with get_mol (imported from
+    my_toolset; presumably returns an RDKit Mol -- verify against that module).
+    Dummy atoms are grouped by isotope number. For every isotope that occurs on
+    exactly two dummy atoms, the neighbors of those two atoms are joined by a
+    single bond and both dummy atoms are deleted. Any dummy atom left over is
+    replaced through ReplaceSubstructs with a '[#1]' pattern and the hydrogens
+    are stripped by a SMILES round trip plus RemoveHs.
+
+    constSmi:    SMILES of the constant part (core).
+    varSmi:      SMILES of the variable part (R group).
+    return_type: 'smiles' returns a SMILES string, 'mol' returns the molecule;
+                 any other value falls through and returns None.
+    '''
+    comboSmi=bondLabel(constSmi+'.'+varSmi)
+    combo_mol=get_mol(comboSmi)
+    dummy_query=Chem.MolFromSmarts('[#0]')  ## detect the dummy atoms
+    # Keyed by isotope label so that any label value is accepted; a fixed-size
+    # list raised IndexError for labels >= 5.
+    dummy_info={}
+    for imatch in combo_mol.GetSubstructMatches(dummy_query):  # look through all the dummy atoms
+        atm_idx=imatch[0]
+        dummy_atom=combo_mol.GetAtomWithIdx(atm_idx)
+        isotope=dummy_atom.GetIsotope()
+        dummy_info.setdefault(isotope,[]).append((atm_idx,get_dummy_negb(dummy_atom)))
+    edcombo=Chem.EditableMol(combo_mol)
+    dummyAtoms=[]
+    for isotope in sorted(dummy_info):  # ascending label order
+        idummyPair=dummy_info[isotope]
+        if len(idummyPair)==2:  # only an unambiguous pair is connected
+            edcombo.AddBond(idummyPair[0][1],idummyPair[1][1],order=Chem.rdchem.BondType.SINGLE)
+            dummyAtoms.append(idummyPair[0][0])
+            dummyAtoms.append(idummyPair[1][0])
+    # Remove from the highest index down so earlier removals do not shift the
+    # indices of the atoms still waiting to be removed.
+    for idummy in sorted(dummyAtoms,reverse=True):
+        edcombo.RemoveAtom(idummy)
+    combo=edcombo.GetMol()
+    # Replace the remaining dummy atoms with hydrogen
+    products=Chem.ReplaceSubstructs(combo,Chem.MolFromSmarts('[#0]'),Chem.MolFromSmarts('[#1]'),replaceAll=True)
+    combo=products[0]
+    combo_smi=Chem.MolToSmiles(combo)  ## To remove the hydrogen
+    combo=Chem.MolFromSmiles(combo_smi)
+    combo=Chem.RemoveHs(combo)
+    if return_type=='mol':
+        return combo
+    if return_type=='smiles':
+        return Chem.MolToSmiles(combo)
+def connect_constVar_try(constSmi, varSmi, return_type='smiles'):
+    ''' Fail-safe wrapper around connect_constVar.
+
+    constSmi, varSmi: forwarded unchanged to connect_constVar.
+    return_type:      forwarded to connect_constVar (it used to be ignored and
+                      'smiles' was always requested).
+    Returns whatever connect_constVar returns, or an empty string '' when the
+    combination raises for any reason (e.g. unparsable SMILES).
+    '''
+    try:
+        return connect_constVar(constSmi, varSmi, return_type=return_type)
+    except Exception:
+        # Deliberate best-effort: one bad row must not abort a whole batch.
+        # `Exception` (not a bare except) lets Ctrl-C / SystemExit propagate.
+        return ''
+def get_completeMol(rootFolder, overwrite=False, unique=False):
+    ''' Combine generated R groups with their cores and collect the results.
+
+    Reads  {rootFolder}/generated_molecules.csv and uses the columns
+    'constantSMILES', 'fromVarSMILES', 'toVarSMILES', 'Delta_Value' and every
+    'Predicted_smi_<i>' column with i in 1..9999.
+    Writes {rootFolder}/generated_molecules_complete.csv (input table with the
+           predicted columns replaced by complete molecules, plus 'Source_Mol'
+           and 'Target_Mol') and
+           {rootFolder}/generated_collection.csv (one row per generated
+           molecule: 'Source_Mol', 'Gen_Mol', 'Delta_pki', sorted by
+           'Delta_pki' in descending order).
+
+    rootFolder: folder holding the input CSV and receiving the outputs.
+    overwrite:  when False and the complete CSV already exists, nothing is done.
+    unique:     when True, keep only the first (highest 'Delta_pki') row for
+                each distinct 'Gen_Mol'.
+    Returns None.
+    '''
+    if not overwrite and Path(f"{rootFolder}/generated_molecules_complete.csv").exists():
+        print('COMBINE MOL EXIST, SKIP COMBINATION!')
+        return
+    dfGen=pd.read_csv(f"{rootFolder}/generated_molecules.csv")  ## load generated compounds
+    # Resolve the predicted columns once instead of probing 9999 names per row.
+    knownCols=set(dfGen.columns)
+    predCols=[f"Predicted_smi_{igen}" for igen in range(1,10000) if f"Predicted_smi_{igen}" in knownCols]
+    for icol in predCols:
+        dfGen[icol]=dfGen.apply(lambda x:connect_constVar_try(x['constantSMILES'],x[icol]),axis=1)
+    dfGen["Source_Mol"]=dfGen.apply(lambda x:connect_constVar_try(x['constantSMILES'],x['fromVarSMILES']),axis=1)
+    dfGen["Target_Mol"]=dfGen.apply(lambda x:connect_constVar_try(x['constantSMILES'],x['toVarSMILES']),axis=1)
+    dfGen.to_csv(f"{rootFolder}/generated_molecules_complete.csv", index=None)
+    # NOTE(review): this pattern picks up unsigned numbers only, so a leading
+    # '-' in Delta_Value is dropped -- confirm that matches the column format.
+    numPattern=re.compile(r'(\d+(?:\.\d+)?)')
+    gen_list=[]
+    for _,irow in dfGen.iterrows():
+        srcCPD=canonic_smiles(irow["Source_Mol"])
+        Delta_pki=np.array([float(i) for i in numPattern.findall(irow["Delta_Value"])]).mean()
+        for icol in predCols:
+            smi=irow[icol]
+            # connect_constVar_try signals failure with '', not NaN, so the
+            # empty string has to be filtered explicitly as well.
+            if pd.isna(smi) or smi=='':
+                continue
+            gen_list.append([srcCPD,canonic_smiles(smi),Delta_pki])
+    dfRes=pd.DataFrame(gen_list, columns=["Source_Mol","Gen_Mol",'Delta_pki'])
+    # Stable sort keeps generation order among equal Delta_pki values; sorting
+    # before de-duplication makes drop_duplicates keep the best-scored row.
+    dfRes=dfRes.sort_values(by="Delta_pki", ascending=False, kind="stable").reset_index(drop=True)
+    if unique:
+        dfRes.drop_duplicates(subset=['Gen_Mol'],inplace=True)
+        print('removing dulplicated........')
+    print(f"Total {len(dfRes)} molecules have been generated.")
+    dfRes.to_csv(f"{rootFolder}/generated_collection.csv", index=None)
+def _str2bool(value):
+    ''' Convert a command-line token such as "True"/"false"/"1"/"no" to a bool. '''
+    if isinstance(value, bool):
+        return value
+    token=value.strip().lower()
+    if token in ('true','t','yes','y','1'):
+        return True
+    if token in ('false','f','no','n','0',''):
+        return False
+    raise argparse.ArgumentTypeError(f"expected a boolean value, got {value!r}")
+def get_parser():
+    ''' Parse the command line (sys.argv) and return the argparse namespace.
+
+    Options:
+      --rootFolder  (required) folder with the generated SMILES files.
+      --overwrite   boolean; "--overwrite", "--overwrite True" and
+                    "--overwrite False" are all accepted.
+      --unique      boolean, same forms as --overwrite.
+    Returns a Namespace with attributes rootFolder, overwrite and unique.
+    '''
+    parser=argparse.ArgumentParser()
+    parser.add_argument("--rootFolder", help="the root folder to save the generated SMILES", required=True, default='')
+    # type=bool would turn every non-empty string -- including "False" -- into
+    # True; parse the text explicitly and allow the bare flag form as well.
+    parser.add_argument('--overwrite', type=_str2bool, nargs='?', const=True, default=False, help='whether overwrite exist file')
+    parser.add_argument('--unique', type=_str2bool, nargs='?', const=True, default=False, help='whether to drop duplicated generated molecules')
+    args=parser.parse_args()
+    return args
+if __name__ == '__main__':
+    # Script entry point: read the CLI options, then run the combination step.
+    cli=get_parser()
+    get_completeMol(cli.rootFolder, overwrite=cli.overwrite, unique=cli.unique)