# Spaces: Songyou / LLM-fastAPI (Sleeping)
# File size: 5,874 Bytes, 14be268
''' Common import and functions  '''
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
import os,sys
import re
import sqlite3
from glob import glob
from pathlib import Path
import rdkit
from rdkit import Chem, DataStructs
from rdkit.Chem import Descriptors, rdMolDescriptors, AllChem, QED
from rdkit.Chem import ChemicalFeatures
from my_toolset.my_utils import get_mol,canonic_smiles
from rdkit import RDConfig
from functools import partial
import numpy as np
import re
import argparse
import copy

def get_dummy_negb(atom):
    ''' Return the atom index of the first neighbor of the given (dummy) atom '''
    neighbors = atom.GetNeighbors()
    first_neighbor = neighbors[0]
    neighbor_idx = first_neighbor.GetIdx()
    return int(neighbor_idx)

def bondLabel(smi):
    ''' Rewrite atom-map labelled dummy atoms as isotope-style labels

    Every ``*:<n>`` occurrence in the SMILES text is turned into ``<n>*``,
    so ``[*:1]`` becomes ``[1*]``. The whole number is captured, which keeps
    multi-digit labels intact (``[*:12]`` -> ``[12*]``); text without such
    labels is returned unchanged.

    Args:
        smi: SMILES string, possibly containing ``[*:<n>]`` attachment points.

    Returns:
        The SMILES string with the labels rewritten.
    '''
    # A single regex pass avoids the partial-replacement problem of replacing
    # "*:1" inside "*:12" with successive str.replace calls.
    return re.sub(r"\*:(\d+)", r"\1*", smi)

def connect_constVar(constSmi, varSmi, return_type='smiles'):
    ''' Connect R group(s) to the core through matching dummy-atom labels

    The two SMILES are joined into one disconnected molecule. Dummy atoms are
    grouped by their label (the isotope value produced by bondLabel; unlabelled
    dummies share label 0). Whenever a label occurs on exactly two dummy atoms,
    the neighbors of those two dummies are joined by a single bond and both
    dummies are deleted. Dummy atoms that are still present afterwards are
    replaced by hydrogen.

    Args:
        constSmi: SMILES of the constant part (core).
        varSmi: SMILES of the variable part (R group).
        return_type: 'smiles' to get a SMILES string, 'mol' to get the
            molecule object. Any other value makes the function return None.

    Returns:
        The connected molecule as a SMILES string or as a molecule object,
        depending on ``return_type``.

    Raises:
        Whatever the underlying toolkit calls raise for unparsable input; use
        connect_constVar_try for a non-raising variant.
    '''
    comboSmi = bondLabel(constSmi + '.' + varSmi)
    # get_mol is a project helper; presumably it returns an RDKit Mol - confirm.
    combo_mol = get_mol(comboSmi)
    dummy_query = Chem.MolFromSmarts('[#0]')  # atomic number 0 == dummy atom
    match = combo_mol.GetSubstructMatches(dummy_query)

    # label -> list of [dummy atom idx, label, idx of the dummy's neighbor].
    # A dict (instead of a fixed 5-slot list) accepts any label value.
    dummy_info = {}
    for imatch in match:
        atm_idx = imatch[0]
        dummy_atom = combo_mol.GetAtomWithIdx(atm_idx)
        isotope = dummy_atom.GetIsotope()
        dummy_negb = get_dummy_negb(dummy_atom)
        dummy_info.setdefault(isotope, []).append([atm_idx, isotope, dummy_negb])

    edcombo = Chem.EditableMol(combo_mol)
    dummyAtoms = []
    # Walk the labels in ascending order, as the original slot list did.
    for isotope in sorted(dummy_info):
        idummyPair = dummy_info[isotope]
        if len(idummyPair) == 2:
            # Bond the two neighbors directly; the dummies are removed below.
            edcombo.AddBond(idummyPair[0][2], idummyPair[1][2], order=Chem.rdchem.BondType.SINGLE)
            dummyAtoms.append(idummyPair[0][0])
            dummyAtoms.append(idummyPair[1][0])
    # Delete from the highest index down so the indices still to be removed
    # are not shifted by earlier deletions.
    for idummy in sorted(dummyAtoms, reverse=True):
        edcombo.RemoveAtom(idummy)
    combo = edcombo.GetMol()

    # Replace every remaining (unpaired) dummy atom with hydrogen.
    products = Chem.ReplaceSubstructs(combo, dummy_query, Chem.MolFromSmarts('[#1]'), replaceAll=True)
    combo = products[0]
    # Round-trip through SMILES and strip hydrogens so the replacement
    # hydrogens do not remain as explicit atoms.
    combo_smi = Chem.MolToSmiles(combo)
    combo = Chem.MolFromSmiles(combo_smi)
    combo = Chem.RemoveHs(combo)
    if return_type == 'mol':
        return combo
    if return_type == 'smiles':
        return Chem.MolToSmiles(combo)

def connect_constVar_try(constSmi, varSmi, return_type='smiles'):
    ''' Best-effort wrapper around connect_constVar

    Args:
        constSmi: SMILES of the constant part (core).
        varSmi: SMILES of the variable part (R group).
        return_type: forwarded to connect_constVar ('smiles' or 'mol').

    Returns:
        The result of connect_constVar, or an empty string when the
        connection fails for any reason.
    '''
    try:
        # Forward return_type instead of always forcing 'smiles'.
        return connect_constVar(constSmi, varSmi, return_type=return_type)
    except Exception:
        # Deliberate best-effort: a fragment that cannot be connected is
        # reported as ''. Catching Exception (not a bare except) lets
        # KeyboardInterrupt/SystemExit still stop a long batch run.
        return ''
    

def get_completeMol(rootFolder, overwrite=False, unique=False):
    ''' Assemble complete molecules from the generated fragments and collect them

    Reads ``{rootFolder}/generated_molecules.csv`` and joins the
    ``constantSMILES`` of every row with each variable fragment: all
    ``Predicted_smi_<n>`` columns (1 <= n < 10000) plus ``fromVarSMILES`` and
    ``toVarSMILES``. Two files are written into ``rootFolder``:

    * ``generated_molecules_complete.csv`` - the input table with the
      prediction columns replaced by complete molecules and the added
      ``Source_Mol`` / ``Target_Mol`` columns;
    * ``generated_collection.csv`` - one row per successfully built generated
      molecule with columns ``Source_Mol``, ``Gen_Mol`` and ``Delta_pki``
      (mean of the numbers found in ``Delta_Value``), sorted by ``Delta_pki``
      in descending order.

    Args:
        rootFolder: folder holding the input CSV and receiving the outputs.
        overwrite: when False and ``generated_molecules_complete.csv`` already
            exists, nothing is done.
        unique: when True, keep only the first (highest ``Delta_pki``) row for
            each distinct ``Gen_Mol``.

    Returns:
        None.
    '''
    if not overwrite and Path(f"{rootFolder}/generated_molecules_complete.csv").exists():
        print('COMBINE MOL EXIST, SKIP COMBINATION!')
        return
    dfGen = pd.read_csv(f"{rootFolder}/generated_molecules.csv")  ## load generated compounds

    # Resolve the prediction columns once instead of probing 1..9999 again
    # for every row further down.
    pred_cols = [
        col
        for col in (f"Predicted_smi_{igen}" for igen in range(1, 10000))
        if col in dfGen.columns
    ]

    def _connect_column(var_col):
        # Column-wise zip gives the same per-row pairs as a row-wise apply,
        # and also works when the table has no rows.
        return [
            connect_constVar_try(const_smi, var_smi)
            for const_smi, var_smi in zip(dfGen['constantSMILES'], dfGen[var_col])
        ]

    for col in pred_cols:
        dfGen[col] = _connect_column(col)
    dfGen["Source_Mol"] = _connect_column('fromVarSMILES')
    dfGen["Target_Mol"] = _connect_column('toVarSMILES')
    dfGen.to_csv(f"{rootFolder}/generated_molecules_complete.csv", index=None)

    # NOTE(review): this pattern has no sign, so a negative number in
    # Delta_Value is read as positive - confirm that the column never holds
    # negative values. It also assumes Delta_Value is text.
    delta_pattern = re.compile(r'(\d+(?:\.\d+)?)')
    gen_list = []
    for _, irow in dfGen.iterrows():
        srcCPD = canonic_smiles(irow["Source_Mol"])
        Delta_pki = np.array([float(i) for i in delta_pattern.findall(irow["Delta_Value"])]).mean()
        for col in pred_cols:
            smi = irow[col]
            # connect_constVar_try reports a failed connection as '', so an
            # NaN check alone would let empty molecules into the collection.
            if pd.isna(smi) or smi == '':
                continue
            gen_list.append([srcCPD, canonic_smiles(smi), Delta_pki])

    dfRes = pd.DataFrame(gen_list, columns=["Source_Mol", "Gen_Mol", 'Delta_pki'])
    # One stable sort is enough: drop_duplicates keeps the row order, so the
    # table is still sorted afterwards and ties keep their input order.
    dfRes.sort_values(by="Delta_pki", ascending=False, inplace=True, kind="mergesort")
    if unique:
        dfRes.drop_duplicates(subset=['Gen_Mol'], inplace=True)
        print('removing dulplicated........')
    print(f"Total {len(dfRes)} molecules have been generated.")
    dfRes.to_csv(f"{rootFolder}/generated_collection.csv", index=None)
    


def _str2bool(value):
    ''' Convert a command-line token such as "True", "false", "1" or "no" to bool '''
    if isinstance(value, bool):
        return value
    token = value.strip().lower()
    if token in ('true', 't', 'yes', 'y', '1'):
        return True
    # The empty string stays False, as it was with the former type=bool.
    if token in ('false', 'f', 'no', 'n', '0', ''):
        return False
    raise argparse.ArgumentTypeError(f"expected a boolean value, got {value!r}")


def get_parser(argv=None):
    ''' Parse the command-line options of this script

    Args:
        argv: optional list of argument strings; when None the arguments are
            taken from ``sys.argv`` (argparse default).

    Returns:
        argparse.Namespace with ``rootFolder`` (str), ``overwrite`` (bool) and
        ``unique`` (bool).
    '''
    parser = argparse.ArgumentParser()
    parser.add_argument("--rootFolder", help="the root folder to save the generated SMILES", required=True)
    # type=bool would turn every non-empty string (even "False") into True, so
    # the value is parsed explicitly. nargs='?' with const=True additionally
    # allows the bare flag form (--overwrite) next to --overwrite True/False.
    parser.add_argument('--overwrite', type=_str2bool, nargs='?', const=True, default=False,
                        help='whether overwrite exist file')
    parser.add_argument('--unique', type=_str2bool, nargs='?', const=True, default=False,
                        help='whether to drop duplicated generated molecules')
    args = parser.parse_args(argv)
    return args

if __name__ == '__main__':
    # Script entry point: read the CLI options, then build and collect the
    # complete molecules under the requested root folder.
    args = get_parser()
    get_completeMol(
        rootFolder=args.rootFolder,
        overwrite=args.overwrite,
        unique=args.unique,
    )