|
|
import streamlit as st |
|
|
import sys |
|
|
from fragment_embedder import FragmentEmbedder |
|
|
from morgan_desc import * |
|
|
from physchem_desc import * |
|
|
from rdkit import Chem |
|
|
import pandas as pd |
|
|
import os |
|
|
import random |
|
|
import numpy as np |
|
|
import joblib |
|
|
from rdkit import Chem |
|
|
from rdkit.Chem import Draw |
|
|
from rdkit.Chem import Draw |
|
|
from rdkit.Chem import AllChem |
|
|
from rdkit import DataStructs |
|
|
from rdkit.Chem import Descriptors |
|
|
from scipy import stats |
|
|
import textwrap |
|
|
from datasets import load_dataset |
|
|
import requests |
|
|
from io import BytesIO |
|
|
import urllib.request |
|
|
|
|
|
|
|
|
import warnings |
|
|
warnings.filterwarnings('ignore') |
|
|
|
|
|
# Configure the Streamlit page; must run before any other st.* call.
st.set_page_config(
    page_title="Ligand Discovery 4: Fragment Predictions",
    page_icon=":home:",
    layout="wide",
    initial_sidebar_state="expanded"
)
|
|
|
|
|
# Pull the main content upwards to reclaim vertical space.
# NOTE(review): ".css-13sdm1b.e16nr0p33" is an auto-generated Streamlit class
# name tied to a specific Streamlit version — likely to break on upgrade.
st.markdown("""
<style>
.css-13sdm1b.e16nr0p33 {
margin-top: -75px;
}
</style>
""", unsafe_allow_html=True)

# Hide Streamlit's default chrome (hamburger menu, footer, header).
hide_streamlit_style = """
<style>
#MainMenu {visibility: hidden;}
footer {visibility: hidden;}
#header {visibility: hidden;}
</style>
"""
st.markdown(hide_streamlit_style, unsafe_allow_html=True)
|
|
|
|
|
# --- Precomputed per-fragment predictions from the Hugging Face dataset hub ---
dataset = load_dataset('ligdis/data', data_files={"predictions.csv"})
df_predictions = dataset['train'].to_pandas()

# Keep the original row order of InChIKeys; used later both to align the
# Enamine catalog rows and to split inputs into precomputed vs. to-compute.
predictions_inchikeys = df_predictions["inchikey"].tolist()
df_predictions = df_predictions.rename(columns={"inchikey": "InChIKey"})

# Applicability-domain descriptors (MW, LogP, similarity to training set).
# NOTE(review): axis=1 concat assumes applicability.csv rows are in exactly
# the same order as predictions.csv — verify upstream.
dataset = load_dataset('ligdis/data', data_files={"applicability.csv"})
df_applicability = dataset['train'].to_pandas()

df_predictions = pd.concat([df_predictions, df_applicability], axis=1)

# SMILES for the in-house (Ligand Discovery / CeMM) fragment collection.
dataset = load_dataset('ligdis/data', data_files={"cemm_smiles.csv"})
cemm_smiles = dataset['train'].to_pandas()

# Map Ligand Discovery fragment id (column 0) -> SMILES (column 1).
fid2smi = {}
for r in cemm_smiles.values:
    fid2smi[r[0]] = r[1]

# Featurizer turning FFF SMILES into model-ready embeddings.
fe = FragmentEmbedder()
|
|
|
|
|
# SMARTS for the CRF probe region (diazirine + alkyne) that every fully
# functionalized fragment must contain.
CRF_PATTERN = "CC1(CCC#C)N=N1"
# Fallback sub-patterns: alkyne and diazirine matched independently.
CRF_PATTERN_0 = "C#CC"
CRF_PATTERN_1 = "N=N"
|
|
|
|
|
# --- Enamine FFF catalog: catalog id <-> SMILES / InChIKey lookup tables ---
dataset = load_dataset('ligdis/data', data_files={"all_fff_enamine.csv"})
enamine_catalog = dataset['train'].to_pandas()
enamine_catalog_ids_set = set(enamine_catalog["catalog_id"])
enamine_catalog_dict = {}  # catalog id -> SMILES
catalog2inchikey = {}      # catalog id -> InChIKey
smiles2catalog = {}        # SMILES -> catalog id
# NOTE(review): mapping row i to predictions_inchikeys[i] assumes the Enamine
# catalog rows are aligned one-to-one, in order, with predictions.csv — confirm.
for i, r in enumerate(enamine_catalog.values):
    enamine_catalog_dict[r[0]] = r[1]
    catalog2inchikey[r[0]] = predictions_inchikeys[i]
    smiles2catalog[r[1]] = r[0]
|
|
|
|
|
|
|
|
def is_enamine_catalog_id(identifier, catalog_ids=None):
    """Return True if *identifier* is a known Enamine FFF catalog id.

    Parameters
    ----------
    identifier : str
        Candidate Enamine catalog identifier (e.g. "Z5645472552").
    catalog_ids : collection, optional
        Set of valid catalog ids; defaults to the module-level
        ``enamine_catalog_ids_set`` loaded at startup.
    """
    if catalog_ids is None:
        catalog_ids = enamine_catalog_ids_set
    # A membership test already evaluates to a bool; no if/else needed.
    return identifier in catalog_ids
|
|
|
|
|
|
|
|
def is_enamine_smiles(identifier, smiles_index=None):
    """Return True if *identifier* is a SMILES present in the Enamine catalog.

    Parameters
    ----------
    identifier : str
        Candidate SMILES string.
    smiles_index : mapping, optional
        Mapping of SMILES -> catalog id; defaults to the module-level
        ``smiles2catalog`` built at startup.
    """
    if smiles_index is None:
        smiles_index = smiles2catalog
    return identifier in smiles_index
|
|
|
|
|
|
|
|
def is_ligand_discovery_id(identifier, known_ids=None):
    """Return True if *identifier* is an in-house Ligand Discovery fragment id.

    Parameters
    ----------
    identifier : str
        Candidate fragment identifier (e.g. "C310").
    known_ids : mapping or set, optional
        Collection of valid Ligand Discovery ids; defaults to the
        module-level ``fid2smi`` mapping loaded at startup.
    """
    if known_ids is None:
        known_ids = fid2smi
    return identifier in known_ids
|
|
|
|
|
|
|
|
def is_valid_smiles(smiles):
    """Return True if RDKit can parse *smiles* into a molecule.

    ``Chem.MolFromSmiles`` returns None on ordinary parse failures; the
    try/except guards against pathological inputs that make RDKit raise.
    """
    try:
        mol = Chem.MolFromSmiles(smiles)
    except Exception:  # was a bare except; don't swallow KeyboardInterrupt/SystemExit
        mol = None
    return mol is not None
|
|
|
|
|
|
|
|
def has_crf(mol):
    """Return True if *mol* contains the CRF probe region.

    Matches the full CRF SMARTS first; failing that, falls back to requiring
    both the alkyne (CRF_PATTERN_0) and diazirine (CRF_PATTERN_1)
    sub-patterns simultaneously.
    """
    if mol.HasSubstructMatch(Chem.MolFromSmarts(CRF_PATTERN)):
        return True
    alkyne_found = mol.HasSubstructMatch(Chem.MolFromSmarts(CRF_PATTERN_0))
    diazirine_found = mol.HasSubstructMatch(Chem.MolFromSmarts(CRF_PATTERN_1))
    return alkyne_found and diazirine_found
|
|
|
|
|
# --- Model metadata: display names, descriptions and AUROC performance ---
dataset = load_dataset('ligdis/data', data_files={"model_catalog.csv"})
dm = dataset['train'].to_pandas()
all_models = dm["model_name"].tolist()

dataset = load_dataset('ligdis/data', data_files={"models_performance.tsv"})
dp = dataset['train'].to_pandas()

model_display = {}      # model_name -> short display label
model_description = {}  # model_name -> human-readable description
for r in dm.values:
    model_display[r[0]] = r[1]
    model_description[r[0]] = r[2]
model_auroc = {}        # model_name -> mean AUROC over the train/test splits
for r in dp.values:
    model_auroc[r[0]] = r[1]

# Partition models by family based on their name prefix.
prom_models = [x for x in dm["model_name"].tolist() if x.startswith("promiscuity")]
sign_models = [x for x in dm["model_name"].tolist() if x.startswith("signature")]

# Promiscuity predictors: 3 global models plus 9 target-specific variants.
global_promiscuity_models = ["promiscuity_pxf0", "promiscuity_pxf1", "promiscuity_pxf2"]
specific_promiscuity_models = ["promiscuity_fxp0_pxf0", "promiscuity_fxp1_pxf0","promiscuity_fxp2_pxf0", "promiscuity_fxp0_pxf1", "promiscuity_fxp1_pxf1", "promiscuity_fxp2_pxf1", "promiscuity_fxp0_pxf2", "promiscuity_fxp1_pxf2", "promiscuity_fxp2_pxf2"]
|
|
|
|
|
def model_to_markdown(model_names):
    """Format one "<display> (<auroc>): <description>" line per model.

    Parameters
    ----------
    model_names : list of str
        Internal model names; must be keys of the module-level
        ``model_display`` / ``model_auroc`` / ``model_description`` dicts.

    Returns
    -------
    str
        Newline-joined listing suitable for ``st.text``.
    """
    # join over a generator replaces the manual append-then-join loop.
    return "\n".join(
        "{0} ({1:.3f}): {2}".format(
            model_display[mn].ljust(8), model_auroc[mn], model_description[mn]
        )
        for mn in model_names
    )
|
|
|
|
|
st.sidebar.title("Ligand Discovery 4: Fragment Predictions")

# Build a small random example (mix of catalog ids and SMILES) to show as the
# text-area placeholder.
placeholder_text = []
keys = random.sample(sorted(enamine_catalog_ids_set), 5)
for k in keys:
    placeholder_text += [random.choice([k, enamine_catalog_dict[k]])]
placeholder_text = "\n".join(placeholder_text)

# Fix: placeholder_text was previously built but never shown to the user.
text_input = st.sidebar.text_area(
    label="Input your fully functionalized fragments:",
    placeholder=placeholder_text,
)
# One identifier per line; strip stray spaces and drop empty lines.
inputs = [x.strip(" ") for x in text_input.split("\n")]
inputs = [x for x in inputs if x != ""]
if len(inputs) > 999:
    # NOTE(review): this only warns — processing still continues below.
    st.sidebar.error("Please limit the number of input fragments to 999.")

# Typo fix: "will through an error" -> "will throw an error".
st.sidebar.info("This tool expects fully functionalized fragments (FFF) as input, including the diazirine+alkyne probe (CRF). We have tailored the chemical space of the predictions to FFFs; the app will throw an error if any of the input molecules does not contain a CRF region. Enamine provides a good [catalog](https://enamine.net/compound-libraries/fragment-libraries/fully-functionalized-probe-library) of FFFs. For a quick test input, use any of the options below")

# Copy-pasteable worked examples for each accepted identifier type.
example_0 = ["Z5645472552", "Z5645472643", "Z5645472785"]
st.sidebar.markdown("**Input Enamine FFF identifiers...**")
st.sidebar.text("\n".join(example_0))

example_1 = ["C#CCCC1(CCCNC(=O)C(Cc2c[nH]c3ncccc23)NC(=O)OC(C)(C)C)N=N1", "C#CCCC1(CCCNC(=O)[C@H]2CCC(=O)NC2)N=N1", "C#CCCC1(CCCNC(=O)CSc2ncc(C(=O)OCC)c(N)n2)N=N1"]
st.sidebar.markdown("**Input FFF SMILES strings...**")
st.sidebar.text("\n".join(example_1))

example_2 = ["C310", "C045", "C391"]
st.sidebar.markdown("**Input Ligand Discovery identifiers...**")
st.sidebar.text("\n".join(example_2))

example_3 = ["Z5645486561", "C#CCCCC1(CCCC(=O)N2CCC(C(C(=O)O)c3ccc(C)cc3)CC2)N=N1", "C279"]
st.sidebar.markdown("**Input a mix of the above identifiers**")
st.sidebar.text("\n".join(example_3))
|
|
|
|
|
# --- Resolve each raw input into a [identifier, SMILES, InChIKey] row ---
R = []
all_inputs_are_valid = True
for i, inp in enumerate(inputs):
    input_id = "input-{0}".format(str(i).zfill(3))  # NOTE(review): currently unused
    if is_enamine_catalog_id(inp):
        # Enamine catalog id: look up SMILES and precomputed InChIKey.
        smiles = enamine_catalog_dict[inp]
        inchikey = catalog2inchikey[inp]
        r = [inp, smiles, inchikey]
    elif is_ligand_discovery_id(inp):
        # Ligand Discovery id: SMILES from the in-house table, InChIKey via RDKit.
        smiles = fid2smi[inp]
        inchikey = Chem.MolToInchiKey(Chem.MolFromSmiles(smiles))
        r = [inp, smiles, inchikey]
    elif is_enamine_smiles(inp):
        # SMILES that exists in the Enamine catalog: normalize to its catalog id.
        smiles = inp
        inp = smiles2catalog[smiles]
        inchikey = catalog2inchikey[inp]
        r = [inp, smiles, inchikey]
    elif is_valid_smiles(inp):
        mol = Chem.MolFromSmiles(inp)
        if has_crf(mol):
            # Unknown but valid FFF SMILES: use the InChIKey as its identifier.
            inchikey = Chem.rdinchi.InchiToInchiKey(Chem.MolToInchi(mol))
            r = [inchikey, inp, inchikey]
        else:
            st.error(
                "Input SMILES {0} does not have the CRF. The CRF pattern is {1}.".format(
                    inp, CRF_PATTERN
                )
            )
            all_inputs_are_valid = False
            # Bug fix: previously fell through to `R += [r]`, appending a stale
            # row (or raising NameError when the first input was invalid).
            continue
    else:
        st.error(
            "Input {0} is not valid. Please enter a valid fully-functionalized fragment SMILES string or an Enamine catalog identifier of a fully-functionalized fragment".format(
                inp
            )
        )
        all_inputs_are_valid = False
        continue
    R += [r]
|
|
|
|
|
def get_fragment_image(smiles):
    """Render a 200x200 2D depiction of the molecule given by *smiles*."""
    molecule = Chem.MolFromSmiles(smiles)
    # Generate 2D coordinates before drawing.
    AllChem.Compute2DCoords(molecule)
    return Draw.MolToImage(molecule, size=(200, 200))
|
|
|
|
|
# Static legend explaining the four result columns shown per fragment.
st.markdown(
    """
Explanation for Output: The results are displayed in 4 Columns.
1. **Structure** of the FFF, InChi, Enamine ID
2. **Chemical space**: Displays the Molecular Weight (*MW*), Walden-Crippen *LogP* and Tanimoto Similarity to the most similar fragment (*Sim-1*) and third most similar fragment (*Sim-3*) in the training set
3. **Promiscuity Predictions** based on 12 Model: 3 Global (section **A**) and 9 Specific (section **B**)
4. **Ontology Predictions** based on 9 _Signature_ Models derived from protein annotations of multiple scopes - from domains and families to molecular functions and cellular localization
"""
)

# Three-column legend of the available models.
myCol = st.columns(3)

with myCol[0]:
    st.subheader("Promiscuity Predictions")
    st.markdown("**A. Global models**")
    st.text(model_to_markdown(global_promiscuity_models))
    st.markdown("**C. Aggregated score**")
    st.text("Sum : Sum of individual promiscuity predictors")
with myCol[1]:
    # Two blank lines vertically align this column with column 0's subheader.
    st.text("")
    st.text("")
    st.markdown("**B. Specific models**")
    st.text(model_to_markdown(specific_promiscuity_models))

with myCol[2]:
    st.subheader("Ontology Predictions")
    signature_models = ["signature_{0}".format(i) for i in range(10)]
    st.text(model_to_markdown(signature_models))

# Notes on score interpretation (percentiles and the low-AUROC "!" flag).
st.markdown(
    """
- Model score (range 0 -> 1) corresponds to the mean AUROC in 10 train-test splits
- Percentages in parenthesis denote the percentile of the score across the Enamine collection of FFFs (>250k compounds). for example, in "Sign-4: 0.02 (35.7%)", **35.7** is the percentile of score.
- The exclamation sign (!) next to the prediction output indicates that the corresponding model has an AUROC accuracy below 0.7 (*! is a warning sign*)
"""
)
st.divider()
|
|
|
|
|
if all_inputs_are_valid and len(R) > 0:
    # Reference distribution for the aggregated promiscuity percentile.
    sum_of_promiscuities = np.sum(
        df_predictions[global_promiscuity_models + specific_promiscuity_models], axis=1
    )
    df = pd.DataFrame(R, columns=["Identifier", "SMILES", "InChIKey"])

    # Preserve the user's input order for the final rendering pass.
    my_inchikeys = df["InChIKey"].tolist()

    # Split inputs into rows with precomputed predictions ("done") and rows
    # that need on-the-fly inference ("todo").
    df_done = df[df["InChIKey"].isin(predictions_inchikeys)]
    df_todo = df[~df["InChIKey"].isin(predictions_inchikeys)]

    if df_done.shape[0] > 0:
        # Attach the precomputed prediction columns by InChIKey.
        df_done = df_done.merge(
            df_predictions, on="InChIKey", how="left"
        ).drop_duplicates()
|
|
|
|
|
if df_todo.shape[0] > 0: |
|
|
X = fe.transform(df_todo["SMILES"].tolist()) |
|
|
|
|
|
st.info("Making predictions... this make take a few seconds. Please be patient. We may experience high traffic. If something goes wrong, please try again later.") |
|
|
|
|
|
progress_bar = st.progress(0) |
|
|
|
|
|
for i, model_name in enumerate(all_models): |
|
|
url = ''.join(('https://huggingface.co/ligdis/fpred/resolve/main/', model_name, '.joblib')) |
|
|
with urllib.request.urlopen(url) as response: |
|
|
model = joblib.load(BytesIO(response.read())) |
|
|
vals = model.predict(X) |
|
|
del model |
|
|
progress_bar.progress((i + 1) / len(all_models)) |
|
|
df_todo[model_name] = vals |
|
|
|
|
|
url = 'https://huggingface.co/ligdis/fpred/resolve/main/cemm_ecfp_2_1024.joblib' |
|
|
with urllib.request.urlopen(url) as response: |
|
|
dataset_fps = joblib.load(BytesIO(response.read())) |
|
|
|
|
|
all_query_smiles = df_todo["SMILES"].tolist() |
|
|
|
|
|
sims_1 = [] |
|
|
sims_3 = [] |
|
|
logps = [] |
|
|
mwts = [] |
|
|
for query_smiles in all_query_smiles: |
|
|
query_mol = Chem.MolFromSmiles(query_smiles) |
|
|
query_fp = AllChem.GetMorganFingerprintAsBitVect(query_mol, 2, nBits=1024) |
|
|
similarity_scores = [ |
|
|
DataStructs.TanimotoSimilarity(query_fp, dataset_fp) |
|
|
for dataset_fp in dataset_fps |
|
|
] |
|
|
sorted_scores_indices = sorted( |
|
|
enumerate(similarity_scores), key=lambda x: x[1], reverse=True |
|
|
) |
|
|
top_n = 3 |
|
|
sims_1 += [sorted_scores_indices[0][1]] |
|
|
sims_3 += [sorted_scores_indices[2][1]] |
|
|
logps += [Descriptors.MolLogP(query_mol)] |
|
|
mwts += [Descriptors.MolWt(query_mol)] |
|
|
results = {"sims-1": sims_1, "sims-3": sims_3, "logp": logps, "mw": mwts} |
|
|
for k in ["sims-1", "sims-3", "logp", "mw"]: |
|
|
df_todo[k] = results[k] |
|
|
|
|
|
    # Recombine precomputed and freshly computed rows.
    if df_done.shape[0] > 0 and df_todo.shape[0] > 0:
        df_ = pd.concat([df_done, df_todo])
    else:
        if df_done.shape[0] > 0:
            df_ = df_done
        else:
            df_ = df_todo
    # Merge prediction columns back onto the original rows via InChIKey,
    # keeping the first occurrence when the same fragment was entered twice.
    df_ = df_.drop(columns=["Identifier", "SMILES"])
    df = df.merge(df_, on="InChIKey", how="left")
    df.drop_duplicates(subset=['InChIKey'], keep='first', inplace=True, ignore_index=True)
    # Rename internal model/descriptor column names to their display labels.
    df = df.rename(columns=model_display)
    applicability_display = {
        "mw": "MW",
        "logp": "LogP",
        "sims-1": "Sim-1",
        "sims-3": "Sim-3",
    }
    df = df.rename(columns=applicability_display)

    # Rename the reference table too so percentile lookups use display labels.
    df_predictions = df_predictions.rename(columns=model_display)
    df_predictions = df_predictions.rename(columns=applicability_display)
|
|
|
|
|
prom_columns = [] |
|
|
for i in range(3): |
|
|
prom_columns += ["Prom-{0}".format(i)] |
|
|
for j in range(3): |
|
|
prom_columns += ["Prom-{0}-{0}".format(i, j)] |
|
|
|
|
|
def identifiers_text(ik, smi, ident): |
|
|
s = ["{0}".format(ik), "{0}".format(smi)] |
|
|
if ik != ident: |
|
|
s += ["{0}".format(ident)] |
|
|
return "\n".join(s) |
|
|
|
|
|
def score_text(v, c): |
|
|
all_scores = np.array(df_predictions[c]) |
|
|
perc = stats.percentileofscore(all_scores, v) |
|
|
t = "{0}: {1:.2f} ({2:.1f}%)".format(c.ljust(8), v, perc).ljust(22) |
|
|
if c == "Sign-4" or c == "Sign-7" or c == "Sign-3": |
|
|
t += " (!)" |
|
|
return t |
|
|
|
|
|
def score_texts(vs, cs): |
|
|
all_texts = [] |
|
|
for v, c in zip(vs, cs): |
|
|
all_texts += [score_text(v, c)] |
|
|
return "\n".join(all_texts) |
|
|
|
|
|
    # Re-order rows to match the user's input order, then render one section
    # per fragment.
    dorig = pd.DataFrame({"InChIKey": my_inchikeys})
    df = dorig.merge(df, on="InChIKey", how="left")
    df = df.reset_index(inplace=False, drop=True)
    for i, r in enumerate(df.iterrows()):
        v = r[1]
        # NOTE(review): indexing `inputs` by the row label assumes df rows
        # align one-to-one with the raw inputs; duplicates were dropped above,
        # which could shift this alignment — verify with duplicated inputs.
        st.markdown("#### Input {0}: `{1}`".format(i+1, inputs[r[0]]))
        cols = st.columns(4)
        # Column 0: structure depiction and identifiers.
        cols[0].markdown("**Fragment**")
        cols[0].image(get_fragment_image(v["SMILES"]))
        cols[0].text(identifiers_text(v["InChIKey"], v["SMILES"], v["Identifier"]))

        # Column 1: applicability-domain descriptors.
        cols[1].markdown("**Chemical space**")
        my_cols = ["MW", "LogP", "Sim-1", "Sim-3"]
        cols[1].text(score_texts(v[my_cols], my_cols))

        # Column 2: global + specific promiscuity scores and their sum.
        cols[2].markdown("**Promiscuity**")
        sum_prom = np.sum(v[prom_columns])
        perc_prom = stats.percentileofscore(sum_of_promiscuities, sum_prom)
        my_cols = ["Prom-0", "Prom-1", "Prom-2"]
        cols[2].text(score_texts(v[my_cols], my_cols))

        my_cols = [
            "Prom-0-0",
            "Prom-0-1",
            "Prom-0-2",
            "Prom-1-0",
            "Prom-1-1",
            "Prom-1-2",
            "Prom-2-0",
            "Prom-2-1",
            "Prom-2-2",
        ]
        cols[2].text(score_texts(v[my_cols], my_cols))

        # Aggregated promiscuity score and its percentile.
        cols[2].text("Sum : {0:.2f} ({1:.1f}%)".format(sum_prom, perc_prom))

        # Column 3: ontology (signature) model scores.
        cols[3].markdown("**Signatures**")
        my_cols = ["Sign-{0}".format(i) for i in range(10)]
        cols[3].text(score_texts(v[my_cols], my_cols))
        st.divider()
|
|
|
|
|
def convert_df(df): |
|
|
return df.to_csv(index=False).encode("utf-8") |
|
|
|
|
|
    # Offer the full result table as a downloadable CSV.
    csv = convert_df(df)

    st.download_button(
        "Download as CSV", csv, "predictions.csv", "text/csv", key="download-csv"
    )
|
|
|
|
|
|