ligdis committed on
Commit
7fdd79e
·
verified ·
1 Parent(s): c03935d

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +88 -86
app.py CHANGED
@@ -16,6 +16,7 @@ from rdkit.Chem import AllChem
16
  from rdkit import DataStructs
17
  from rdkit.Chem import Descriptors
18
  from scipy import stats
 
19
  from datasets import load_dataset
20
  import requests
21
  from io import BytesIO
@@ -25,12 +26,29 @@ import warnings
25
  warnings.filterwarnings('ignore')
26
 
27
  st.set_page_config(
28
- page_title="Fragment predictor app",
29
- layout="wide",
30
- initial_sidebar_state="expanded",
31
- page_icon=None,
32
  )
33
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
34
  dataset = load_dataset('ligdis/data', data_files={"predictions.csv"})
35
  df_predictions = dataset['train'].to_pandas()
36
 
@@ -111,9 +129,6 @@ def has_crf(mol):
111
  return False
112
  return True
113
 
114
-
115
- st.title("Fully-functionalized fragment predictions")
116
-
117
  dataset = load_dataset('ligdis/data', data_files={"model_catalog.csv"})
118
  dm = dataset['train'].to_pandas()
119
  all_models = dm["model_name"].tolist()
@@ -133,6 +148,9 @@ for r in dp.values:
133
  prom_models = [x for x in dm["model_name"].tolist() if x.startswith("promiscuity")]
134
  sign_models = [x for x in dm["model_name"].tolist() if x.startswith("signature")]
135
 
 
 
 
136
  def model_to_markdown(model_names):
137
  items = []
138
  for mn in model_names:
@@ -144,52 +162,7 @@ def model_to_markdown(model_names):
144
  markdown_list = "\n".join(items)
145
  return markdown_list
146
 
147
- st.sidebar.title("Promiscuity models")
148
-
149
- st.sidebar.markdown("**Global models**")
150
-
151
- global_promiscuity_models = ["promiscuity_pxf0", "promiscuity_pxf1", "promiscuity_pxf2"]
152
- st.sidebar.text(model_to_markdown(global_promiscuity_models))
153
-
154
- st.sidebar.markdown("**Specific models**")
155
-
156
- specific_promiscuity_models = [
157
- "promiscuity_fxp0_pxf0",
158
- "promiscuity_fxp1_pxf0",
159
- "promiscuity_fxp2_pxf0",
160
- "promiscuity_fxp0_pxf1",
161
- "promiscuity_fxp1_pxf1",
162
- "promiscuity_fxp2_pxf1",
163
- "promiscuity_fxp0_pxf2",
164
- "promiscuity_fxp1_pxf2",
165
- "promiscuity_fxp2_pxf2",
166
- ]
167
- st.sidebar.text(model_to_markdown(specific_promiscuity_models))
168
-
169
- st.sidebar.markdown("**Aggregated score**")
170
- st.sidebar.text("Sum : Sum of individual promiscuity predictors.")
171
-
172
- st.sidebar.title("Signature models")
173
- signature_models = ["signature_{0}".format(i) for i in range(10)]
174
- st.sidebar.text(model_to_markdown(signature_models))
175
-
176
- st.sidebar.title("Chemical space")
177
- s = ["MW : Molecular weight.",
178
- "LogP : Walden-Crippen LogP.",
179
- "Sim-1 : Tanimoto similarity to the most ",
180
- " similar fragment in the training set.",
181
- "Sim-3 : Tanimoto similarity to the third ",
182
- " most similar fragment in the training set."]
183
-
184
- st.sidebar.text("\n".join(s))
185
-
186
- st.sidebar.text("* The score in parenthesis corresponds to the mean AUROC in 10 train-test splits")
187
-
188
- st.sidebar.markdown("**In the main page...**")
189
- s = textwrap.wrap("1. Percentages in parenthesis denote the percentile of the score across the Enamine collection of FFFs (>250k compounds)", width=60)
190
- st.sidebar.text("\n".join(s))
191
- s = textwrap.wrap("2. The exclamation sign (!) indicates that the corresponding model has an AUROC accuracy below 0.7.", width=60)
192
- st.sidebar.text("\n".join(s))
193
 
194
  placeholder_text = []
195
  keys = random.sample(sorted(enamine_catalog_ids_set), 5)
@@ -197,11 +170,29 @@ for k in keys:
197
  placeholder_text += [random.choice([k, enamine_catalog_dict[k]])]
198
  placeholder_text = "\n".join(placeholder_text)
199
 
200
- text_input = st.text_area(label="Input your fully functionalized fragments:")
201
  inputs = [x.strip(" ") for x in text_input.split("\n")]
202
  inputs = [x for x in inputs if x != ""]
203
  if len(inputs) > 999:
204
- st.error("Please limit the number of input fragments to 999.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
205
 
206
  R = []
207
  all_inputs_are_valid = True
@@ -241,13 +232,50 @@ for i, inp in enumerate(inputs):
241
  all_inputs_are_valid = False
242
  R += [r]
243
 
244
-
245
  def get_fragment_image(smiles):
246
  m = Chem.MolFromSmiles(smiles)
247
  AllChem.Compute2DCoords(m)
248
  im = Draw.MolToImage(m, size=(200, 200))
249
  return im
250
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
251
  if all_inputs_are_valid and len(R) > 0:
252
  sum_of_promiscuities = np.sum(
253
  df_predictions[global_promiscuity_models + specific_promiscuity_models], axis=1
@@ -375,7 +403,6 @@ if all_inputs_are_valid and len(R) > 0:
375
  cols[2].markdown("**Promiscuity**")
376
  sum_prom = np.sum(v[prom_columns])
377
  perc_prom = stats.percentileofscore(sum_of_promiscuities, sum_prom)
378
- cols[2].text("Sum : {0:.2f} ({1:.1f}%)".format(sum_prom, perc_prom))
379
  my_cols = ["Prom-0", "Prom-1", "Prom-2"]
380
  cols[2].text(score_texts(v[my_cols], my_cols))
381
 
@@ -392,9 +419,12 @@ if all_inputs_are_valid and len(R) > 0:
392
  ]
393
  cols[2].text(score_texts(v[my_cols], my_cols))
394
 
 
 
395
  cols[3].markdown("**Signatures**")
396
  my_cols = ["Sign-{0}".format(i) for i in range(10)]
397
  cols[3].text(score_texts(v[my_cols], my_cols))
 
398
 
399
  def convert_df(df):
400
  return df.to_csv(index=False).encode("utf-8")
@@ -404,32 +434,4 @@ if all_inputs_are_valid and len(R) > 0:
404
  st.download_button(
405
  "Download as CSV", csv, "predictions.csv", "text/csv", key="download-csv"
406
  )
407
-
408
- else:
409
- st.info(
410
- "This tool expects fully functionalized fragments (FFF) as input, including the diazirine+alkyne probe (CRF). We have tailored the chemical space of the predictions to FFFs; the app will through an error if any of the input molecules does not contain a CRF region. Enamine provides a good [catalog](https://enamine.net/compound-libraries/fragment-libraries/fully-functionalized-probe-library) of FFFs. For a quick test input, use any of the options below."
411
- )
412
-
413
- example_0 = ["Z5645472552", "Z5645472643", "Z5645472785"]
414
- st.markdown("**Input Enamine FFF identifiers...**")
415
- st.text("\n".join(example_0))
416
-
417
- example_1 = [
418
- "C#CCCC1(CCCNC(=O)C(Cc2c[nH]c3ncccc23)NC(=O)OC(C)(C)C)N=N1",
419
- "C#CCCC1(CCCNC(=O)[C@H]2CCC(=O)NC2)N=N1",
420
- "C#CCCC1(CCCNC(=O)CSc2ncc(C(=O)OCC)c(N)n2)N=N1",
421
- ]
422
- st.markdown("**Input FFF SMILES strings...**")
423
- st.text("\n".join(example_1))
424
-
425
- example_2 = ["C310", "C045", "C391"]
426
- st.markdown("**Input Ligand Discovery identifiers...**")
427
- st.text("\n".join(example_2))
428
-
429
- example_3 = [
430
- "Z5645486561",
431
- "C#CCCCC1(CCCC(=O)N2CCC(C(C(=O)O)c3ccc(C)cc3)CC2)N=N1",
432
- "C279",
433
- ]
434
- st.markdown("**Input a mix of the above identifiers**")
435
- st.text("\n".join(example_3))
 
16
  from rdkit import DataStructs
17
  from rdkit.Chem import Descriptors
18
  from scipy import stats
19
+ import textwrap
20
  from datasets import load_dataset
21
  import requests
22
  from io import BytesIO
 
26
  warnings.filterwarnings('ignore')
27
 
28
  st.set_page_config(
29
+ page_title="Ligand Discovery 4: Fragment Predictions",
30
+ page_icon=":home:",
31
+ layout="wide", # "centered",
32
+ initial_sidebar_state="expanded"
33
  )
34
 
35
+ st.markdown("""
36
+ <style>
37
+ .css-13sdm1b.e16nr0p33 {
38
+ margin-top: -75px;
39
+ }
40
+ </style>
41
+ """, unsafe_allow_html=True)
42
+
43
+ hide_streamlit_style = """
44
+ <style>
45
+ #MainMenu {visibility: hidden;}
46
+ footer {visibility: hidden;}
47
+ #header {visibility: hidden;}
48
+ </style>
49
+ """
50
+ st.markdown(hide_streamlit_style, unsafe_allow_html=True)
51
+
52
  dataset = load_dataset('ligdis/data', data_files={"predictions.csv"})
53
  df_predictions = dataset['train'].to_pandas()
54
 
 
129
  return False
130
  return True
131
 
 
 
 
132
  dataset = load_dataset('ligdis/data', data_files={"model_catalog.csv"})
133
  dm = dataset['train'].to_pandas()
134
  all_models = dm["model_name"].tolist()
 
148
  prom_models = [x for x in dm["model_name"].tolist() if x.startswith("promiscuity")]
149
  sign_models = [x for x in dm["model_name"].tolist() if x.startswith("signature")]
150
 
151
+ global_promiscuity_models = ["promiscuity_pxf0", "promiscuity_pxf1", "promiscuity_pxf2"]
152
+ specific_promiscuity_models = ["promiscuity_fxp0_pxf0", "promiscuity_fxp1_pxf0","promiscuity_fxp2_pxf0", "promiscuity_fxp0_pxf1", "promiscuity_fxp1_pxf1", "promiscuity_fxp2_pxf1", "promiscuity_fxp0_pxf2", "promiscuity_fxp1_pxf2", "promiscuity_fxp2_pxf2"]
153
+
154
  def model_to_markdown(model_names):
155
  items = []
156
  for mn in model_names:
 
162
  markdown_list = "\n".join(items)
163
  return markdown_list
164
 
165
+ st.sidebar.title("Ligand Discovery 4: Fragment Predictions")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
166
 
167
  placeholder_text = []
168
  keys = random.sample(sorted(enamine_catalog_ids_set), 5)
 
170
  placeholder_text += [random.choice([k, enamine_catalog_dict[k]])]
171
  placeholder_text = "\n".join(placeholder_text)
172
 
173
+ text_input = st.sidebar.text_area(label="Input your fully functionalized fragments:")
174
  inputs = [x.strip(" ") for x in text_input.split("\n")]
175
  inputs = [x for x in inputs if x != ""]
176
  if len(inputs) > 999:
177
+ st.sidebar.error("Please limit the number of input fragments to 999.")
178
+
179
+ st.sidebar.info("This tool expects fully functionalized fragments (FFF) as input, including the diazirine+alkyne probe (CRF). We have tailored the chemical space of the predictions to FFFs; the app will throw an error if any of the input molecules does not contain a CRF region. Enamine provides a good [catalog](https://enamine.net/compound-libraries/fragment-libraries/fully-functionalized-probe-library) of FFFs. For a quick test input, use any of the options below.")
180
+
181
+ example_0 = ["Z5645472552", "Z5645472643", "Z5645472785"]
182
+ st.sidebar.markdown("**Input Enamine FFF identifiers...**")
183
+ st.sidebar.text("\n".join(example_0))
184
+
185
+ example_1 = ["C#CCCC1(CCCNC(=O)C(Cc2c[nH]c3ncccc23)NC(=O)OC(C)(C)C)N=N1", "C#CCCC1(CCCNC(=O)[C@H]2CCC(=O)NC2)N=N1", "C#CCCC1(CCCNC(=O)CSc2ncc(C(=O)OCC)c(N)n2)N=N1"]
186
+ st.sidebar.markdown("**Input FFF SMILES strings...**")
187
+ st.sidebar.text("\n".join(example_1))
188
+
189
+ example_2 = ["C310", "C045", "C391"]
190
+ st.sidebar.markdown("**Input Ligand Discovery identifiers...**")
191
+ st.sidebar.text("\n".join(example_2))
192
+
193
+ example_3 = ["Z5645486561", "C#CCCCC1(CCCC(=O)N2CCC(C(C(=O)O)c3ccc(C)cc3)CC2)N=N1", "C279"]
194
+ st.sidebar.markdown("**Input a mix of the above identifiers**")
195
+ st.sidebar.text("\n".join(example_3))
196
 
197
  R = []
198
  all_inputs_are_valid = True
 
232
  all_inputs_are_valid = False
233
  R += [r]
234
 
 
235
  def get_fragment_image(smiles):
236
  m = Chem.MolFromSmiles(smiles)
237
  AllChem.Compute2DCoords(m)
238
  im = Draw.MolToImage(m, size=(200, 200))
239
  return im
240
 
241
+ st.markdown(
242
+ """
243
+ Explanation for Output: The results are displayed in 4 Columns.
244
+ 1. **Structure** of the FFF, InChI, Enamine ID
245
+ 2. **Chemical space**: Displays the Molecular Weight (*MW*), Wildman-Crippen *LogP* and Tanimoto Similarity to the most similar fragment (*Sim-1*) and third most similar fragment (*Sim-3*) in the training set
246
+ 3. **Promiscuity Predictions** based on 12 Models: 3 Global (section **A**) and 9 Specific (section **B**)
247
+ 4. **Ontology Predictions** based on 10 _Signature_ Models derived from protein annotations of multiple scopes - from domains and families to molecular functions and cellular localization
248
+ """
249
+ )
250
+
251
+ myCol = st.columns(3)
252
+
253
+ with myCol[0]:
254
+ st.subheader("Promiscuity Predictions")
255
+ st.markdown("**A. Global models**")
256
+ st.text(model_to_markdown(global_promiscuity_models))
257
+ st.markdown("**C. Aggregated score**")
258
+ st.text("Sum : Sum of individual promiscuity predictors")
259
+ with myCol[1]:
260
+ st.text("")
261
+ st.text("")
262
+ st.markdown("**B. Specific models**")
263
+ st.text(model_to_markdown(specific_promiscuity_models))
264
+
265
+ with myCol[2]:
266
+ st.subheader("Ontology Predictions")
267
+ signature_models = ["signature_{0}".format(i) for i in range(10)]
268
+ st.text(model_to_markdown(signature_models))
269
+
270
+ st.markdown(
271
+ """
272
+ - Model score (range 0 -> 1) corresponds to the mean AUROC in 10 train-test splits
273
+ - Percentages in parentheses denote the percentile of the score across the Enamine collection of FFFs (>250k compounds). For example, in "Sign-4: 0.02 (35.7%)", **35.7** is the percentile of the score.
274
+ - The exclamation sign (!) next to the prediction output indicates that the corresponding model has an AUROC accuracy below 0.7 (*! is a warning sign*)
275
+ """
276
+ )
277
+ st.divider()
278
+
279
  if all_inputs_are_valid and len(R) > 0:
280
  sum_of_promiscuities = np.sum(
281
  df_predictions[global_promiscuity_models + specific_promiscuity_models], axis=1
 
403
  cols[2].markdown("**Promiscuity**")
404
  sum_prom = np.sum(v[prom_columns])
405
  perc_prom = stats.percentileofscore(sum_of_promiscuities, sum_prom)
 
406
  my_cols = ["Prom-0", "Prom-1", "Prom-2"]
407
  cols[2].text(score_texts(v[my_cols], my_cols))
408
 
 
419
  ]
420
  cols[2].text(score_texts(v[my_cols], my_cols))
421
 
422
+ cols[2].text("Sum : {0:.2f} ({1:.1f}%)".format(sum_prom, perc_prom))
423
+
424
  cols[3].markdown("**Signatures**")
425
  my_cols = ["Sign-{0}".format(i) for i in range(10)]
426
  cols[3].text(score_texts(v[my_cols], my_cols))
427
+ st.divider()
428
 
429
  def convert_df(df):
430
  return df.to_csv(index=False).encode("utf-8")
 
434
  st.download_button(
435
  "Download as CSV", csv, "predictions.csv", "text/csv", key="download-csv"
436
  )
437
+