Update app.py
Browse files
app.py
CHANGED
|
@@ -16,6 +16,7 @@ from rdkit.Chem import AllChem
|
|
| 16 |
from rdkit import DataStructs
|
| 17 |
from rdkit.Chem import Descriptors
|
| 18 |
from scipy import stats
|
|
|
|
| 19 |
from datasets import load_dataset
|
| 20 |
import requests
|
| 21 |
from io import BytesIO
|
|
@@ -25,12 +26,29 @@ import warnings
|
|
| 25 |
warnings.filterwarnings('ignore')
|
| 26 |
|
| 27 |
st.set_page_config(
|
| 28 |
-
page_title="Fragment
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
|
| 32 |
)
|
| 33 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 34 |
dataset = load_dataset('ligdis/data', data_files={"predictions.csv"})
|
| 35 |
df_predictions = dataset['train'].to_pandas()
|
| 36 |
|
|
@@ -111,9 +129,6 @@ def has_crf(mol):
|
|
| 111 |
return False
|
| 112 |
return True
|
| 113 |
|
| 114 |
-
|
| 115 |
-
st.title("Fully-functionalized fragment predictions")
|
| 116 |
-
|
| 117 |
dataset = load_dataset('ligdis/data', data_files={"model_catalog.csv"})
|
| 118 |
dm = dataset['train'].to_pandas()
|
| 119 |
all_models = dm["model_name"].tolist()
|
|
@@ -133,6 +148,9 @@ for r in dp.values:
|
|
| 133 |
prom_models = [x for x in dm["model_name"].tolist() if x.startswith("promiscuity")]
|
| 134 |
sign_models = [x for x in dm["model_name"].tolist() if x.startswith("signature")]
|
| 135 |
|
|
|
|
|
|
|
|
|
|
| 136 |
def model_to_markdown(model_names):
|
| 137 |
items = []
|
| 138 |
for mn in model_names:
|
|
@@ -144,52 +162,7 @@ def model_to_markdown(model_names):
|
|
| 144 |
markdown_list = "\n".join(items)
|
| 145 |
return markdown_list
|
| 146 |
|
| 147 |
-
st.sidebar.title("
|
| 148 |
-
|
| 149 |
-
st.sidebar.markdown("**Global models**")
|
| 150 |
-
|
| 151 |
-
global_promiscuity_models = ["promiscuity_pxf0", "promiscuity_pxf1", "promiscuity_pxf2"]
|
| 152 |
-
st.sidebar.text(model_to_markdown(global_promiscuity_models))
|
| 153 |
-
|
| 154 |
-
st.sidebar.markdown("**Specific models**")
|
| 155 |
-
|
| 156 |
-
specific_promiscuity_models = [
|
| 157 |
-
"promiscuity_fxp0_pxf0",
|
| 158 |
-
"promiscuity_fxp1_pxf0",
|
| 159 |
-
"promiscuity_fxp2_pxf0",
|
| 160 |
-
"promiscuity_fxp0_pxf1",
|
| 161 |
-
"promiscuity_fxp1_pxf1",
|
| 162 |
-
"promiscuity_fxp2_pxf1",
|
| 163 |
-
"promiscuity_fxp0_pxf2",
|
| 164 |
-
"promiscuity_fxp1_pxf2",
|
| 165 |
-
"promiscuity_fxp2_pxf2",
|
| 166 |
-
]
|
| 167 |
-
st.sidebar.text(model_to_markdown(specific_promiscuity_models))
|
| 168 |
-
|
| 169 |
-
st.sidebar.markdown("**Aggregated score**")
|
| 170 |
-
st.sidebar.text("Sum : Sum of individual promiscuity predictors.")
|
| 171 |
-
|
| 172 |
-
st.sidebar.title("Signature models")
|
| 173 |
-
signature_models = ["signature_{0}".format(i) for i in range(10)]
|
| 174 |
-
st.sidebar.text(model_to_markdown(signature_models))
|
| 175 |
-
|
| 176 |
-
st.sidebar.title("Chemical space")
|
| 177 |
-
s = ["MW : Molecular weight.",
|
| 178 |
-
"LogP : Walden-Crippen LogP.",
|
| 179 |
-
"Sim-1 : Tanimoto similarity to the most ",
|
| 180 |
-
" similar fragment in the training set.",
|
| 181 |
-
"Sim-3 : Tanimoto similarity to the third ",
|
| 182 |
-
" most similar fragment in the training set."]
|
| 183 |
-
|
| 184 |
-
st.sidebar.text("\n".join(s))
|
| 185 |
-
|
| 186 |
-
st.sidebar.text("* The score in parenthesis corresponds to the mean AUROC in 10 train-test splits")
|
| 187 |
-
|
| 188 |
-
st.sidebar.markdown("**In the main page...**")
|
| 189 |
-
s = textwrap.wrap("1. Percentages in parenthesis denote the percentile of the score across the Enamine collection of FFFs (>250k compounds)", width=60)
|
| 190 |
-
st.sidebar.text("\n".join(s))
|
| 191 |
-
s = textwrap.wrap("2. The exclamation sign (!) indicates that the corresponding model has an AUROC accuracy below 0.7.", width=60)
|
| 192 |
-
st.sidebar.text("\n".join(s))
|
| 193 |
|
| 194 |
placeholder_text = []
|
| 195 |
keys = random.sample(sorted(enamine_catalog_ids_set), 5)
|
|
@@ -197,11 +170,29 @@ for k in keys:
|
|
| 197 |
placeholder_text += [random.choice([k, enamine_catalog_dict[k]])]
|
| 198 |
placeholder_text = "\n".join(placeholder_text)
|
| 199 |
|
| 200 |
-
text_input = st.text_area(label="Input your fully functionalized fragments:")
|
| 201 |
inputs = [x.strip(" ") for x in text_input.split("\n")]
|
| 202 |
inputs = [x for x in inputs if x != ""]
|
| 203 |
if len(inputs) > 999:
|
| 204 |
-
st.error("Please limit the number of input fragments to 999.")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 205 |
|
| 206 |
R = []
|
| 207 |
all_inputs_are_valid = True
|
|
@@ -241,13 +232,50 @@ for i, inp in enumerate(inputs):
|
|
| 241 |
all_inputs_are_valid = False
|
| 242 |
R += [r]
|
| 243 |
|
| 244 |
-
|
| 245 |
def get_fragment_image(smiles):
    """Render a 2D depiction of the given SMILES string.

    Parameters
    ----------
    smiles : str
        SMILES string of the fragment; assumed to be valid (callers
        validate inputs upstream — MolFromSmiles returns None otherwise).

    Returns
    -------
    PIL.Image.Image
        A 200x200 rendering of the molecule.
    """
    mol = Chem.MolFromSmiles(smiles)
    # Generate 2D coordinates in place so the depiction has a sensible layout.
    AllChem.Compute2DCoords(mol)
    return Draw.MolToImage(mol, size=(200, 200))
|
| 250 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 251 |
if all_inputs_are_valid and len(R) > 0:
|
| 252 |
sum_of_promiscuities = np.sum(
|
| 253 |
df_predictions[global_promiscuity_models + specific_promiscuity_models], axis=1
|
|
@@ -375,7 +403,6 @@ if all_inputs_are_valid and len(R) > 0:
|
|
| 375 |
cols[2].markdown("**Promiscuity**")
|
| 376 |
sum_prom = np.sum(v[prom_columns])
|
| 377 |
perc_prom = stats.percentileofscore(sum_of_promiscuities, sum_prom)
|
| 378 |
-
cols[2].text("Sum : {0:.2f} ({1:.1f}%)".format(sum_prom, perc_prom))
|
| 379 |
my_cols = ["Prom-0", "Prom-1", "Prom-2"]
|
| 380 |
cols[2].text(score_texts(v[my_cols], my_cols))
|
| 381 |
|
|
@@ -392,9 +419,12 @@ if all_inputs_are_valid and len(R) > 0:
|
|
| 392 |
]
|
| 393 |
cols[2].text(score_texts(v[my_cols], my_cols))
|
| 394 |
|
|
|
|
|
|
|
| 395 |
cols[3].markdown("**Signatures**")
|
| 396 |
my_cols = ["Sign-{0}".format(i) for i in range(10)]
|
| 397 |
cols[3].text(score_texts(v[my_cols], my_cols))
|
|
|
|
| 398 |
|
| 399 |
def convert_df(df):
    """Serialize *df* to UTF-8 encoded CSV bytes (index column omitted).

    Used to feed st.download_button, which expects bytes.
    """
    csv_text = df.to_csv(index=False)
    return csv_text.encode("utf-8")
|
|
@@ -404,32 +434,4 @@ if all_inputs_are_valid and len(R) > 0:
|
|
| 404 |
st.download_button(
|
| 405 |
"Download as CSV", csv, "predictions.csv", "text/csv", key="download-csv"
|
| 406 |
)
|
| 407 |
-
|
| 408 |
-
else:
|
| 409 |
-
st.info(
|
| 410 |
-
"This tool expects fully functionalized fragments (FFF) as input, including the diazirine+alkyne probe (CRF). We have tailored the chemical space of the predictions to FFFs; the app will through an error if any of the input molecules does not contain a CRF region. Enamine provides a good [catalog](https://enamine.net/compound-libraries/fragment-libraries/fully-functionalized-probe-library) of FFFs. For a quick test input, use any of the options below."
|
| 411 |
-
)
|
| 412 |
-
|
| 413 |
-
example_0 = ["Z5645472552", "Z5645472643", "Z5645472785"]
|
| 414 |
-
st.markdown("**Input Enamine FFF identifiers...**")
|
| 415 |
-
st.text("\n".join(example_0))
|
| 416 |
-
|
| 417 |
-
example_1 = [
|
| 418 |
-
"C#CCCC1(CCCNC(=O)C(Cc2c[nH]c3ncccc23)NC(=O)OC(C)(C)C)N=N1",
|
| 419 |
-
"C#CCCC1(CCCNC(=O)[C@H]2CCC(=O)NC2)N=N1",
|
| 420 |
-
"C#CCCC1(CCCNC(=O)CSc2ncc(C(=O)OCC)c(N)n2)N=N1",
|
| 421 |
-
]
|
| 422 |
-
st.markdown("**Input FFF SMILES strings...**")
|
| 423 |
-
st.text("\n".join(example_1))
|
| 424 |
-
|
| 425 |
-
example_2 = ["C310", "C045", "C391"]
|
| 426 |
-
st.markdown("**Input Ligand Discovery identifiers...**")
|
| 427 |
-
st.text("\n".join(example_2))
|
| 428 |
-
|
| 429 |
-
example_3 = [
|
| 430 |
-
"Z5645486561",
|
| 431 |
-
"C#CCCCC1(CCCC(=O)N2CCC(C(C(=O)O)c3ccc(C)cc3)CC2)N=N1",
|
| 432 |
-
"C279",
|
| 433 |
-
]
|
| 434 |
-
st.markdown("**Input a mix of the above identifiers**")
|
| 435 |
-
st.text("\n".join(example_3))
|
|
|
|
| 16 |
from rdkit import DataStructs
|
| 17 |
from rdkit.Chem import Descriptors
|
| 18 |
from scipy import stats
|
| 19 |
+
import textwrap
|
| 20 |
from datasets import load_dataset
|
| 21 |
import requests
|
| 22 |
from io import BytesIO
|
|
|
|
| 26 |
warnings.filterwarnings('ignore')
|
| 27 |
|
| 28 |
st.set_page_config(
|
| 29 |
+
page_title="Ligand Discovery 4: Fragment Predictions",
|
| 30 |
+
page_icon=":home:",
|
| 31 |
+
layout="wide", # "centered",
|
| 32 |
+
initial_sidebar_state="expanded"
|
| 33 |
)
|
| 34 |
|
| 35 |
+
st.markdown("""
|
| 36 |
+
<style>
|
| 37 |
+
.css-13sdm1b.e16nr0p33 {
|
| 38 |
+
margin-top: -75px;
|
| 39 |
+
}
|
| 40 |
+
</style>
|
| 41 |
+
""", unsafe_allow_html=True)
|
| 42 |
+
|
| 43 |
+
hide_streamlit_style = """
|
| 44 |
+
<style>
|
| 45 |
+
#MainMenu {visibility: hidden;}
|
| 46 |
+
footer {visibility: hidden;}
|
| 47 |
+
#header {visibility: hidden;}
|
| 48 |
+
</style>
|
| 49 |
+
"""
|
| 50 |
+
st.markdown(hide_streamlit_style, unsafe_allow_html=True)
|
| 51 |
+
|
| 52 |
dataset = load_dataset('ligdis/data', data_files={"predictions.csv"})
|
| 53 |
df_predictions = dataset['train'].to_pandas()
|
| 54 |
|
|
|
|
| 129 |
return False
|
| 130 |
return True
|
| 131 |
|
|
|
|
|
|
|
|
|
|
| 132 |
dataset = load_dataset('ligdis/data', data_files={"model_catalog.csv"})
|
| 133 |
dm = dataset['train'].to_pandas()
|
| 134 |
all_models = dm["model_name"].tolist()
|
|
|
|
| 148 |
prom_models = [x for x in dm["model_name"].tolist() if x.startswith("promiscuity")]
|
| 149 |
sign_models = [x for x in dm["model_name"].tolist() if x.startswith("signature")]
|
| 150 |
|
| 151 |
+
global_promiscuity_models = ["promiscuity_pxf0", "promiscuity_pxf1", "promiscuity_pxf2"]
|
| 152 |
+
specific_promiscuity_models = ["promiscuity_fxp0_pxf0", "promiscuity_fxp1_pxf0","promiscuity_fxp2_pxf0", "promiscuity_fxp0_pxf1", "promiscuity_fxp1_pxf1", "promiscuity_fxp2_pxf1", "promiscuity_fxp0_pxf2", "promiscuity_fxp1_pxf2", "promiscuity_fxp2_pxf2"]
|
| 153 |
+
|
| 154 |
def model_to_markdown(model_names):
|
| 155 |
items = []
|
| 156 |
for mn in model_names:
|
|
|
|
| 162 |
markdown_list = "\n".join(items)
|
| 163 |
return markdown_list
|
| 164 |
|
| 165 |
+
st.sidebar.title("Ligand Discovery 4: Fragment Predictions")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 166 |
|
| 167 |
placeholder_text = []
|
| 168 |
keys = random.sample(sorted(enamine_catalog_ids_set), 5)
|
|
|
|
| 170 |
placeholder_text += [random.choice([k, enamine_catalog_dict[k]])]
|
| 171 |
placeholder_text = "\n".join(placeholder_text)
|
| 172 |
|
| 173 |
+
text_input = st.sidebar.text_area(label="Input your fully functionalized fragments:")
|
| 174 |
inputs = [x.strip(" ") for x in text_input.split("\n")]
|
| 175 |
inputs = [x for x in inputs if x != ""]
|
| 176 |
if len(inputs) > 999:
|
| 177 |
+
st.sidebar.error("Please limit the number of input fragments to 999.")
|
| 178 |
+
|
| 179 |
+
st.sidebar.info("This tool expects fully functionalized fragments (FFF) as input, including the diazirine+alkyne probe (CRF). We have tailored the chemical space of the predictions to FFFs; the app will through an error if any of the input molecules does not contain a CRF region. Enamine provides a good [catalog](https://enamine.net/compound-libraries/fragment-libraries/fully-functionalized-probe-library) of FFFs. For a quick test input, use any of the options below")
|
| 180 |
+
|
| 181 |
+
example_0 = ["Z5645472552", "Z5645472643", "Z5645472785"]
|
| 182 |
+
st.sidebar.markdown("**Input Enamine FFF identifiers...**")
|
| 183 |
+
st.sidebar.text("\n".join(example_0))
|
| 184 |
+
|
| 185 |
+
example_1 = ["C#CCCC1(CCCNC(=O)C(Cc2c[nH]c3ncccc23)NC(=O)OC(C)(C)C)N=N1", "C#CCCC1(CCCNC(=O)[C@H]2CCC(=O)NC2)N=N1", "C#CCCC1(CCCNC(=O)CSc2ncc(C(=O)OCC)c(N)n2)N=N1"]
|
| 186 |
+
st.sidebar.markdown("**Input FFF SMILES strings...**")
|
| 187 |
+
st.sidebar.text("\n".join(example_1))
|
| 188 |
+
|
| 189 |
+
example_2 = ["C310", "C045", "C391"]
|
| 190 |
+
st.sidebar.markdown("**Input Ligand Discovery identifiers...**")
|
| 191 |
+
st.sidebar.text("\n".join(example_2))
|
| 192 |
+
|
| 193 |
+
example_3 = ["Z5645486561", "C#CCCCC1(CCCC(=O)N2CCC(C(C(=O)O)c3ccc(C)cc3)CC2)N=N1", "C279"]
|
| 194 |
+
st.sidebar.markdown("**Input a mix of the above identifiers**")
|
| 195 |
+
st.sidebar.text("\n".join(example_3))
|
| 196 |
|
| 197 |
R = []
|
| 198 |
all_inputs_are_valid = True
|
|
|
|
| 232 |
all_inputs_are_valid = False
|
| 233 |
R += [r]
|
| 234 |
|
|
|
|
| 235 |
def get_fragment_image(smiles):
    """Return a 200x200 PIL image depicting the fragment *smiles*.

    2D coordinates are computed before drawing so RDKit produces a
    readable layout. Assumes *smiles* parses to a valid molecule
    (inputs are validated before this is called).
    """
    molecule = Chem.MolFromSmiles(smiles)
    AllChem.Compute2DCoords(molecule)
    image = Draw.MolToImage(molecule, size=(200, 200))
    return image
|
| 240 |
|
| 241 |
+
st.markdown(
|
| 242 |
+
"""
|
| 243 |
+
Explanation for Output: The results are displayed in 4 Columns.
|
| 244 |
+
1. **Structure** of the FFF, InChi, Enamine ID
|
| 245 |
+
2. **Chemical space**: Displays the Molecular Weight (*MW*), Walden-Crippen *LogP* and Tanimoto Similarity to the most similar fragment (*Sim-1*) and third most similar fragment (*Sim-3*) in the training set
|
| 246 |
+
3. **Promiscuity Predictions** based on 12 Model: 3 Global (section **A**) and 9 Specific (section **B**)
|
| 247 |
+
4. **Ontology Predictions** based on 9 _Signature_ Models derived from protein annotations of multiple scopes - from domains and families to molecular functions and cellular localization
|
| 248 |
+
"""
|
| 249 |
+
)
|
| 250 |
+
|
| 251 |
+
myCol = st.columns(3)
|
| 252 |
+
|
| 253 |
+
with myCol[0]:
|
| 254 |
+
st.subheader("Promiscuity Predictions")
|
| 255 |
+
st.markdown("**A. Global models**")
|
| 256 |
+
st.text(model_to_markdown(global_promiscuity_models))
|
| 257 |
+
st.markdown("**C. Aggregated score**")
|
| 258 |
+
st.text("Sum : Sum of individual promiscuity predictors")
|
| 259 |
+
with myCol[1]:
|
| 260 |
+
st.text("")
|
| 261 |
+
st.text("")
|
| 262 |
+
st.markdown("**B. Specific models**")
|
| 263 |
+
st.text(model_to_markdown(specific_promiscuity_models))
|
| 264 |
+
|
| 265 |
+
with myCol[2]:
|
| 266 |
+
st.subheader("Ontology Predictions")
|
| 267 |
+
signature_models = ["signature_{0}".format(i) for i in range(10)]
|
| 268 |
+
st.text(model_to_markdown(signature_models))
|
| 269 |
+
|
| 270 |
+
st.markdown(
|
| 271 |
+
"""
|
| 272 |
+
- Model score (range 0 -> 1) corresponds to the mean AUROC in 10 train-test splits
|
| 273 |
+
- Percentages in parenthesis denote the percentile of the score across the Enamine collection of FFFs (>250k compounds). for example, in "Sign-4: 0.02 (35.7%)", **35.7** is the percentile of score.
|
| 274 |
+
- The exclamation sign (!) next to the prediction output indicates that the corresponding model has an AUROC accuracy below 0.7 (*! is a warning sign*)
|
| 275 |
+
"""
|
| 276 |
+
)
|
| 277 |
+
st.divider()
|
| 278 |
+
|
| 279 |
if all_inputs_are_valid and len(R) > 0:
|
| 280 |
sum_of_promiscuities = np.sum(
|
| 281 |
df_predictions[global_promiscuity_models + specific_promiscuity_models], axis=1
|
|
|
|
| 403 |
cols[2].markdown("**Promiscuity**")
|
| 404 |
sum_prom = np.sum(v[prom_columns])
|
| 405 |
perc_prom = stats.percentileofscore(sum_of_promiscuities, sum_prom)
|
|
|
|
| 406 |
my_cols = ["Prom-0", "Prom-1", "Prom-2"]
|
| 407 |
cols[2].text(score_texts(v[my_cols], my_cols))
|
| 408 |
|
|
|
|
| 419 |
]
|
| 420 |
cols[2].text(score_texts(v[my_cols], my_cols))
|
| 421 |
|
| 422 |
+
cols[2].text("Sum : {0:.2f} ({1:.1f}%)".format(sum_prom, perc_prom))
|
| 423 |
+
|
| 424 |
cols[3].markdown("**Signatures**")
|
| 425 |
my_cols = ["Sign-{0}".format(i) for i in range(10)]
|
| 426 |
cols[3].text(score_texts(v[my_cols], my_cols))
|
| 427 |
+
st.divider()
|
| 428 |
|
| 429 |
def convert_df(df):
    """Return the prediction table as UTF-8 CSV bytes, without the index.

    The bytes form is what st.download_button requires for its payload.
    """
    as_text = df.to_csv(index=False)
    return bytes(as_text, "utf-8")
|
|
|
|
| 434 |
st.download_button(
|
| 435 |
"Download as CSV", csv, "predictions.csv", "text/csv", key="download-csv"
|
| 436 |
)
|
| 437 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|