| from collections import defaultdict |
|
|
| from admet_ai import ADMETModel |
| import networkx as nx |
|
|
| import rdkit |
| from rdkit import Chem, DataStructs |
| from rdkit.Chem import AllChem, Descriptors |
| import rdkit.Chem.QED as QED |
| from rdkit import RDLogger |
| RDLogger.DisableLog('rdApp.*') |
|
|
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
|
|
| from rdkit.Chem import rdMolDescriptors |
| import pickle |
| import math |
| import os.path as op |
|
|
| _fscores = None |
|
|
|
|
def readFragmentScores(name='fpscores'):
    """Load the Ertl/Schuffenhauer fragment score table into the module
    cache ``_fscores``.

    The pickle holds entries shaped [score, bitId1, bitId2, ...]; they are
    flattened into a dict mapping each Morgan bit id to its float score.

    :param name: base name of the gzipped pickle; the default resolves to
        ``fpscores.pkl.gz`` next to this module.
    """
    import gzip
    global _fscores

    if name == "fpscores":
        name = op.join(op.dirname(__file__), name)
    # Use a context manager so the gzip handle is closed deterministically
    # (previously the file object was leaked).
    with gzip.open('%s.pkl.gz' % name) as f:
        data = pickle.load(f)
    outDict = {}
    for entry in data:
        score = float(entry[0])
        for bit in entry[1:]:
            outDict[bit] = score
    _fscores = outDict
|
|
|
|
def numBridgeheadsAndSpiro(mol, ri=None):
    """Return ``(nBridgehead, nSpiro)`` atom counts for *mol*.

    The ``ri`` (ring info) argument is accepted for interface compatibility
    but unused — RDKit computes both counts directly from the molecule.
    """
    return (rdMolDescriptors.CalcNumBridgeheadAtoms(mol),
            rdMolDescriptors.CalcNumSpiroAtoms(mol))
|
|
|
|
def calculateScore(m):
    """Synthetic accessibility (SA) score of an RDKit molecule.

    Implements the Ertl & Schuffenhauer (2009) SA score: a fragment
    frequency term plus structural complexity penalties, rescaled to
    [1, 10] where 1 is easy to synthesize and 10 is very hard.

    :param m: RDKit Mol.
    :return: float SA score in [1.0, 10.0].
    """
    # Lazily load the fragment score table on first use.
    if _fscores is None:
        readFragmentScores()

    # Fragment score: count-weighted mean of Morgan(radius=2) bit scores;
    # bits absent from the table contribute the floor value -4.
    fp = rdMolDescriptors.GetMorganFingerprint(m, 2)
    fps = fp.GetNonzeroElements()
    score1 = 0.
    nf = 1e-6  # guards against division by zero for an empty fingerprint
    for bitId, v in fps.items():
        nf += v
        score1 += _fscores.get(bitId, -4) * v
    score1 /= nf

    # Structural complexity features.
    nAtoms = m.GetNumAtoms()
    nChiralCenters = len(Chem.FindMolChiralCenters(m, includeUnassigned=True))
    ri = m.GetRingInfo()
    nBridgeheads, nSpiro = numBridgeheadsAndSpiro(m, ri)
    nMacrocycles = 0
    for ring in ri.AtomRings():
        if len(ring) > 8:  # macrocycle = ring with more than 8 atoms
            nMacrocycles += 1

    sizePenalty = nAtoms**1.005 - nAtoms
    stereoPenalty = math.log10(nChiralCenters + 1)
    spiroPenalty = math.log10(nSpiro + 1)
    bridgePenalty = math.log10(nBridgeheads + 1)
    # Flat penalty if any macrocycle is present.
    macrocyclePenalty = math.log10(2) if nMacrocycles > 0 else 0.

    score2 = 0. - sizePenalty - stereoPenalty - spiroPenalty - bridgePenalty - macrocyclePenalty

    # Symmetry correction: molecules with many atoms but few distinct
    # fragments are easier to make than the fragment term suggests.
    score3 = 0.
    if nAtoms > len(fps):
        score3 = math.log(float(nAtoms) / len(fps)) * .5

    sascore = score1 + score2 + score3

    # Rescale the raw score onto 1..10 (variables renamed from min/max,
    # which shadowed the builtins).
    raw_min = -4.0
    raw_max = 2.5
    sascore = 11. - (sascore - raw_min + 1) / (raw_max - raw_min) * 9.
    # Smooth the high end of the scale, then clamp to [1, 10].
    if sascore > 8.:
        sascore = 8. + math.log(sascore + 1. - 9.)
    if sascore > 10.:
        sascore = 10.0
    elif sascore < 1.:
        sascore = 1.0

    return sascore
|
|
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
|
|
|
|
| |
| import numpy as np |
| from rdkit import rdBase |
| from sklearn import svm |
| import sys |
| |
| sys.modules['sklearn.svm.classes'] = svm |
| import re |
| rdBase.DisableLog('rdApp.error') |
|
|
| """Scores based on an ECFP classifier for activity.""" |
|
|
| clf_model = None |
def load_model():
    """Load the pickled DRD2 ECFP activity classifier into ``clf_model``.

    Reads ``clf_py36.pkl`` from the directory containing this module.
    """
    global clf_model
    model_path = op.join(op.dirname(__file__), 'clf_py36.pkl')
    with open(model_path, "rb") as fh:
        clf_model = pickle.load(fh)
|
|
def get_score(smile):
    """Predicted probability of DRD2 activity for a SMILES string.

    Returns 0.0 when the SMILES cannot be parsed. Lazily loads the
    classifier on first call.
    """
    if clf_model is None:
        load_model()

    mol = Chem.MolFromSmiles(smile)
    if not mol:
        return 0.0
    features = fingerprints_from_mol(mol)
    proba = clf_model.predict_proba(features)[:, 1]
    return float(proba)
|
|
def fingerprints_from_mol(mol):
    """Fold a count-based feature Morgan fingerprint (radius 3) into a
    (1, 2048) int32 array, summing counts of colliding bits."""
    size = 2048
    folded = np.zeros((1, size), np.int32)
    counts = AllChem.GetMorganFingerprint(mol, 3, useCounts=True, useFeatures=True)
    for bit_id, count in counts.GetNonzeroElements().items():
        folded[0, bit_id % size] += int(count)
    return folded
|
|
|
|
| import pandas as pd |
|
|
|
|
def similarity(a, b):
    """Tanimoto similarity between two SMILES strings using 2048-bit
    Morgan (radius 2) fingerprints; 0.0 when either input is None or
    fails to parse."""
    if a is None or b is None:
        return 0.0
    mol_a = Chem.MolFromSmiles(a)
    mol_b = Chem.MolFromSmiles(b)
    if mol_a is None or mol_b is None:
        return 0.0

    fp_a, fp_b = (
        AllChem.GetMorganFingerprintAsBitVect(m, 2, nBits=2048, useChirality=False)
        for m in (mol_a, mol_b)
    )
    return DataStructs.TanimotoSimilarity(fp_a, fp_b)
|
|
def drd2(s):
    """DRD2 activity score for SMILES ``s``; 0.0 for None or unparsable
    input."""
    if s is None:
        return 0.0
    if Chem.MolFromSmiles(s) is None:
        return 0.0
    return get_score(s)
|
|
def qed(s):
    """QED drug-likeness of SMILES ``s``; 0.0 for None or unparsable
    input."""
    if s is None:
        return 0.0
    mol = Chem.MolFromSmiles(s)
    return 0.0 if mol is None else QED.qed(mol)
|
|
def sas(s):
    """Synthetic accessibility score of SMILES ``s``; 0.0 for None or
    unparsable input."""
    if s is None:
        return 0.0
    mol = Chem.MolFromSmiles(s)
    return 0.0 if mol is None else calculateScore(mol)
|
|
| |
def penalized_logp(s):
    """Penalized logP of SMILES ``s``: z-normalized logP minus the SA
    score minus a long-cycle penalty.

    Returns -100.0 for None or unparsable input. The mean/std constants
    are fixed dataset statistics (presumably from the ZINC training set
    — verify against the upstream source).
    """
    if s is None:
        return -100.0
    mol = Chem.MolFromSmiles(s)
    if mol is None:
        return -100.0

    # Fixed normalization statistics — do not change.
    logP_mean = 2.4570953396190123
    logP_std = 1.434324401111988
    SA_mean = -3.0525811293166134
    SA_std = 0.8335207024513095
    cycle_mean = -0.0485696876403053
    cycle_std = 0.2860212110245455

    log_p = Descriptors.MolLogP(mol)
    SA = -calculateScore(mol)

    # Penalize the largest ring by how far it exceeds 6 atoms.
    cycles = nx.cycle_basis(nx.Graph(Chem.rdmolops.GetAdjacencyMatrix(mol)))
    largest = max((len(c) for c in cycles), default=0)
    cycle_score = -(largest - 6) if largest > 6 else 0

    return ((log_p - logP_mean) / logP_std
            + (SA - SA_mean) / SA_std
            + (cycle_score - cycle_mean) / cycle_std)
|
|
def smiles2D(s):
    """Canonical SMILES for ``s``, or None when it cannot be parsed.

    Previously an unparsable SMILES raised from ``MolToSmiles(None)``;
    returning None matches ``canonicalize_smiles``.
    """
    mol = Chem.MolFromSmiles(s)
    if mol is None:
        return None
    return Chem.MolToSmiles(mol)
|
|
|
|
| |
def canonicalize_smiles(smiles):
    """Canonical SMILES for ``smiles``, or None when it cannot be parsed.

    ``MolFromSmiles`` signals failure by returning None (it does not
    raise), so check it explicitly; the except clause is narrowed from a
    bare ``except:`` to Exception for any residual RDKit errors.
    """
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return None
    try:
        return Chem.MolToSmiles(mol, canonical=True)
    except Exception:
        return None
|
|
def generate_props(smiles):
    """
    Generates properties using the ADMET model, plus penalized logP, QED
    and SA score for each unique canonical SMILES.

    :param smiles: list(list(str)) — nested lists of SMILES strings.
    :return props_df: pandas DataFrame with columns
        ['smiles', 'bbbp', 'mutagenicity', 'hia', 'plogp', 'qed', 'sas'].
    """
    # Flatten the nested input into a single list.
    smi = list()
    for group in smiles:
        smi.extend(group)

    smi = set(smi)
    print(f"Number of preprocessed SMILES: {len(smi)}")
    smi = [canonicalize_smiles(s) for s in smi]
    # Canonicalization can map distinct inputs to the same string, so
    # de-duplicate again (sorted for a deterministic order) and drop
    # failures.
    smi = sorted({s for s in smi if s is not None})
    # BUG FIX: previously printed len(smiles) — the nested input — here.
    print(f"Number of postprocessed SMILES: {len(smi)}")

    model = ADMETModel(num_workers=4)
    props_df = model.predict(smi)
    props_df = pd.DataFrame(props_df)
    props_df.reset_index(inplace=True)
    props_df.rename(columns={'index': 'smiles', 'BBB_Martins': 'bbbp',
                             'AMES': 'mutagenicity', 'HIA_Hou': 'hia'},
                    inplace=True)
    props_df = props_df[['smiles', 'bbbp', 'mutagenicity', 'hia']].round(2)
    props_df.set_index('smiles', inplace=True)

    # Augment the ADMET predictions with locally computed properties.
    props = defaultdict(dict)
    for s in smi:
        for col in props_df.columns:
            props[s][col] = props_df.loc[s][col]
        if 'plogp' not in props[s]:
            props[s]['plogp'] = penalized_logp(s)
        if 'qed' not in props[s]:
            props[s]['qed'] = qed(s)
        if 'sas' not in props[s]:
            props[s]['sas'] = sas(s)

    # (Dropped the former no-op renames of 'BBBP'/'AMES'/'HIA_Hou': those
    # columns were already renamed and selected away above.)
    props_df = pd.DataFrame(props).T
    props_df = props_df.reset_index().rename(columns={'index': 'smiles'})
    props_df = props_df.round(2)

    return props_df
| |
|
|
|
|
def pair_similarity(amol, bmol, sim_type):
    """Tanimoto similarity between two molecules.

    :param amol: RDKit Mol or SMILES string.
    :param bmol: RDKit Mol or SMILES string.
    :param sim_type: "binary" for 2048-bit Morgan vectors; anything else
        uses count-based Morgan fingerprints (both radius 2).
    :return: float similarity, 0.0 for None/unparsable input.
    """
    if amol is None or bmol is None:
        return 0.0

    # Accept SMILES strings and parse them on the fly.
    if isinstance(amol, str):
        amol = Chem.MolFromSmiles(amol)
    if isinstance(bmol, str):
        bmol = Chem.MolFromSmiles(bmol)
    if amol is None or bmol is None:
        return 0.0

    if sim_type == "binary":
        fingerprint = lambda m: AllChem.GetMorganFingerprintAsBitVect(
            m, 2, nBits=2048, useChirality=False)
    else:
        fingerprint = lambda m: AllChem.GetMorganFingerprint(
            m, 2, useChirality=False)

    return DataStructs.TanimotoSimilarity(fingerprint(amol), fingerprint(bmol))
|
|
def compute_FP_sim(smiles):
    """Symmetric pairwise Tanimoto similarity matrix over the parseable
    SMILES in ``smiles`` (unparsable entries are silently dropped)."""
    mols = [m for m in (Chem.MolFromSmiles(s) for s in smiles) if m]

    n = len(mols)
    sim = np.zeros((n, n))
    for i, mol_i in enumerate(mols):
        for j in range(i, n):
            value = pair_similarity(mol_i, mols[j], "binary")
            sim[i, j] = value
            sim[j, i] = value
    return sim
|
|
def normalize_prop(val, min_val, max_val):
    """Linearly rescale ``val`` from [min_val, max_val] onto [0, 1],
    clamping values outside the range."""
    scaled = (val - min_val) / (max_val - min_val)
    if scaled < 0:
        return 0
    if scaled > 1:
        return 1
    return scaled
|
|
def find_best_optimized(input_prop, output_prop, property):
    """
    Find the best optimized SMILES that satisfies constraints and maximizes improvement.
    Args:
        input_prop (dict): Properties of the input molecule.
        output_prop (dict): Properties of the optimized molecules
            (property name -> list of candidate values).
        property (list): List of properties to optimize.

    Returns:
        int: Index of the best optimized molecule, or None if no candidate
        moves every property in the desired direction (mutagenicity must
        strictly decrease; every other property must strictly increase).
    """
    best_idx = None
    best_gain = -1
    num_candidates = len(output_prop[property[0]])

    for idx in range(num_candidates):
        candidate = {p: output_prop[p][idx] for p in property}

        # Reject any candidate that fails the strict direction constraint
        # on even one property.
        directions_ok = True
        for p in property:
            if p == "mutagenicity":
                directions_ok = candidate[p] < input_prop[p]
            else:
                directions_ok = candidate[p] > input_prop[p]
            if not directions_ok:
                break
        if not directions_ok:
            continue

        # Aggregate relative improvement across properties; a zero
        # baseline contributes the raw output value instead of a ratio.
        gain = 0
        for p in property:
            base = input_prop[p]
            if base == 0:
                gain += candidate[p]
            elif p == 'mutagenicity':
                gain += (base - candidate[p]) / abs(base)
            else:
                gain += (candidate[p] - base) / abs(base)

        # '>=' keeps the LAST of equally good candidates, as before.
        if gain >= best_gain:
            best_gain = gain
            best_idx = idx

    return best_idx
|
|
|
|
def compute_metrics(input_smiles, input_props, output_smiles, output_props_df, task,
                    normalize = {'plogp': (-20, 10)}):
    """
    Compute the metrics for the optimization task.
    Args:
        input_smiles: list(str)
        input_props: list(dict) — per-input property values.
        output_smiles: list(list(str)) — generated candidates per input.
        output_props_df: pandas df with a 'smiles' column plus one column
            per property (NOTE: this function set_index()s it in place).
        task: str — '+'-separated property names, e.g. 'qed+mutagenicity'.
        normalize: dict mapping property -> (min, max) used for the
            normalized relative-improvement metrics; falsy disables them.
            (Mutable default is read-only here, so it is safe.)
    Returns:
        (dict of aggregate metrics, list of per-input best-candidate records)
    """
    property = task.split('+')
    output_props_df.set_index('smiles', inplace=True)

    SR = diversity = 0
    SAS = []
    num_invalid = 0
    most_optimized_smiles = []
    perc_change = defaultdict(list)
    norm_perc_change = defaultdict(list)
    avg_change = defaultdict(list)
    avg_value = defaultdict(list)
    composite_change = []
    norm_composite_change = []
    similarity_with_input = []
    num_inputs = len(input_smiles)

    most_optimized_smiles_props = []

    for i, opt_smiles in enumerate(output_smiles):
        # No generations at all for this input -> counts as invalid.
        if len(opt_smiles) == 0:
            num_invalid += 1
            continue

        # Collect property values only for generations present in the
        # prediction table, and keep the surviving SMILES aligned with
        # those value lists. BUG FIX: best_idx indexes the filtered lists,
        # but was previously used to index the unfiltered opt_smiles,
        # selecting the wrong molecule whenever any generation was missing
        # from output_props_df.
        valid_smiles = []
        output_props = defaultdict(list)
        for smile in opt_smiles:
            if smile not in output_props_df.index:
                continue
            valid_smiles.append(smile)
            for prop in property:
                output_props[prop].append(output_props_df.loc[smile, prop])
            output_props['sim'].append(pair_similarity(input_smiles[i], smile, "binary"))

        if not valid_smiles:
            # Every generation was missing from the table; same outcome as
            # before (find_best_optimized would return None), counted once.
            num_invalid += 1
            continue

        best_idx = find_best_optimized(input_props[i], output_props, property)
        if best_idx is None:
            continue

        # Sanity check: find_best_optimized only returns candidates that
        # moved every property in the desired direction.
        for prop in property:
            change = -1 if prop == 'mutagenicity' else 1
            assert np.sign(output_props[prop][best_idx] - input_props[i][prop]) == np.sign(change)

        best_smile = valid_smiles[best_idx]
        most_optimized_smiles.append(best_smile)
        most_optimized_smiles_props.append({
            'opt_smiles': best_smile,
            'source_smiles': input_smiles[i],
            'source_properties': input_props[i],
            'optimized_properties': {prop: output_props[prop][best_idx] for prop in property}})

        similarity_with_input.append(output_props['sim'][best_idx])
        try:
            SAS.append(sas(best_smile))
        except Exception:  # narrowed from a bare except; best-effort metric
            pass
        for prop in property:
            avg_value[prop].append(output_props[prop][best_idx])
            input_val = input_props[i].get(prop, 0)
            output_val = output_props[prop][best_idx]

            avg_change[prop].append(abs(output_val - input_val))

            if input_val == 0:
                # Zero baseline: compare against a nominal 0.1 instead of
                # dividing by zero.
                perc_change[prop].append((output_val - 0.1) / 0.1)
            else:
                perc_change[prop].append(abs(input_val - output_val) / abs(input_val))

            # Normalized variant: rescale both values onto [0, 1] first
            # when a range is configured for this property.
            if normalize and prop in normalize:
                input_val = normalize_prop(input_val, *normalize[prop])
                output_val = normalize_prop(output_val, *normalize[prop])
            if input_val == 0:
                norm_perc_change[prop].append((output_val - 0.1) / 0.1)
            else:
                norm_perc_change[prop].append(abs(input_val - output_val) / abs(input_val))

        composite_change.append(np.mean([perc_change[prop][-1] for prop in property]))
        if normalize:
            norm_composite_change.append(np.mean([norm_perc_change[prop][-1] for prop in property]))

    # Diversity = 1 - mean pairwise Tanimoto similarity; undefined for
    # fewer than two successes (previously produced NaN), reported as 0.
    if len(most_optimized_smiles) > 1:
        fp_sim = compute_FP_sim(most_optimized_smiles)
        diversity = 1 - np.mean(fp_sim[np.triu_indices(len(fp_sim), k=1)])
    num_success = len(most_optimized_smiles)

    num_valid = num_inputs - num_invalid

    SR = num_success / num_inputs
    SR_V = num_success / num_valid if num_valid else 0
    validity = num_valid / num_inputs

    ret = {
        "SR": SR * 100,
        "Sim": np.mean(similarity_with_input),
        "RI": np.mean(composite_change),
        "Val": validity * 100,
        "SR (V)": SR_V * 100,
        "Div": diversity,
        "SAS": np.mean(SAS),
        "Norm RI": np.nanmean(norm_composite_change)
    }
    for prop in property:
        ret[f"{prop}-APS"] = np.mean(avg_value[prop])
        ret[f"{prop}-impv"] = np.mean(avg_change[prop])
        ret[f"{prop}-impv%"] = np.mean(perc_change[prop]) * 100
        if normalize:
            ret[f"{prop}-impv(n)%"] = np.mean(norm_perc_change[prop]) * 100

    return ret, most_optimized_smiles_props