Update app.py
Browse files
app.py
CHANGED
|
@@ -21,6 +21,7 @@ from datasets import load_dataset
|
|
| 21 |
import requests
|
| 22 |
from io import BytesIO
|
| 23 |
import urllib.request
|
|
|
|
| 24 |
|
| 25 |
import warnings
|
| 26 |
warnings.filterwarnings('ignore')
|
|
@@ -47,21 +48,21 @@ hide_streamlit_style = """
|
|
| 47 |
#header {visibility: hidden;}
|
| 48 |
</style>
|
| 49 |
"""
|
| 50 |
-
st.markdown(hide_streamlit_style, unsafe_allow_html=True)
|
| 51 |
|
| 52 |
dataset = load_dataset('ligdis/data', data_files={"predictions.csv"})
|
| 53 |
-
df_predictions = dataset['train'].to_pandas()
|
| 54 |
|
| 55 |
predictions_inchikeys = df_predictions["inchikey"].tolist()
|
| 56 |
df_predictions = df_predictions.rename(columns={"inchikey": "InChIKey"})
|
| 57 |
|
| 58 |
dataset = load_dataset('ligdis/data', data_files={"applicability.csv"})
|
| 59 |
-
df_applicability = dataset['train'].to_pandas()
|
| 60 |
|
| 61 |
df_predictions = pd.concat([df_predictions, df_applicability], axis=1)
|
| 62 |
|
| 63 |
dataset = load_dataset('ligdis/data', data_files={"cemm_smiles.csv"})
|
| 64 |
-
cemm_smiles = dataset['train'].to_pandas()
|
| 65 |
|
| 66 |
fid2smi = {}
|
| 67 |
for r in cemm_smiles.values:
|
|
@@ -74,7 +75,7 @@ CRF_PATTERN_0 = "C#CC"
|
|
| 74 |
CRF_PATTERN_1 = "N=N"
|
| 75 |
|
| 76 |
dataset = load_dataset('ligdis/data', data_files={"all_fff_enamine.csv"})
|
| 77 |
-
enamine_catalog = dataset['train'].to_pandas()
|
| 78 |
enamine_catalog_ids_set = set(enamine_catalog["catalog_id"])
|
| 79 |
enamine_catalog_dict = {}
|
| 80 |
catalog2inchikey = {}
|
|
@@ -130,11 +131,11 @@ def has_crf(mol):
|
|
| 130 |
return True
|
| 131 |
|
| 132 |
dataset = load_dataset('ligdis/data', data_files={"model_catalog.csv"})
|
| 133 |
-
dm = dataset['train'].to_pandas()
|
| 134 |
all_models = dm["model_name"].tolist()
|
| 135 |
|
| 136 |
dataset = load_dataset('ligdis/data', data_files={"models_performance.tsv"})
|
| 137 |
-
dp = dataset['train'].to_pandas()
|
| 138 |
|
| 139 |
model_display = {}
|
| 140 |
model_description = {}
|
|
@@ -149,8 +150,8 @@ prom_models = [x for x in dm["model_name"].tolist() if x.startswith("promiscuity
|
|
| 149 |
sign_models = [x for x in dm["model_name"].tolist() if x.startswith("signature")]
|
| 150 |
|
| 151 |
global_promiscuity_models = ["promiscuity_pxf0", "promiscuity_pxf1", "promiscuity_pxf2"]
|
| 152 |
-
specific_promiscuity_models = ["promiscuity_fxp0_pxf0", "promiscuity_fxp1_pxf0","promiscuity_fxp2_pxf0", "promiscuity_fxp0_pxf1", "promiscuity_fxp1_pxf1", "promiscuity_fxp2_pxf1", "promiscuity_fxp0_pxf2", "promiscuity_fxp1_pxf2", "promiscuity_fxp2_pxf2"]
|
| 153 |
-
|
| 154 |
def model_to_markdown(model_names):
|
| 155 |
items = []
|
| 156 |
for mn in model_names:
|
|
@@ -240,8 +241,8 @@ def get_fragment_image(smiles):
|
|
| 240 |
|
| 241 |
st.markdown(
|
| 242 |
"""
|
| 243 |
-
Explanation for Output: The results are displayed in 4 Columns.
|
| 244 |
-
1. **Structure** of the FFF, InChI, Enamine ID
|
| 245 |
2. **Chemical space**: Displays the Molecular Weight (*MW*), Wildman-Crippen *LogP* and Tanimoto Similarity to the most similar fragment (*Sim-1*) and third most similar fragment (*Sim-3*) in the training set
|
| 246 |
3. **Promiscuity Predictions** based on 12 Models: 3 Global (section **A**) and 9 Specific (section **B**)
|
| 247 |
4. **Ontology Predictions** based on 9 _Signature_ Models derived from protein annotations of multiple scopes - from domains and families to molecular functions and cellular localization
|
|
@@ -249,7 +250,7 @@ st.markdown(
|
|
| 249 |
)
|
| 250 |
|
| 251 |
myCol = st.columns(3)
|
| 252 |
-
|
| 253 |
with myCol[0]:
|
| 254 |
st.subheader("Promiscuity Predictions")
|
| 255 |
st.markdown("**A. Global models**")
|
|
@@ -270,7 +271,7 @@ with myCol[2]:
|
|
| 270 |
st.markdown(
|
| 271 |
"""
|
| 272 |
- Model score (range 0 -> 1) corresponds to the mean AUROC in 10 train-test splits
|
| 273 |
-
- Percentages in parentheses denote the percentile of the score across the Enamine collection of FFFs (>250k compounds). For example, in "Sign-4: 0.02 (35.7%)", **35.7** is the percentile of the score.
|
| 274 |
- The exclamation sign (!) next to the prediction output indicates that the corresponding model has an AUROC accuracy below 0.7 (*! is a warning sign*)
|
| 275 |
"""
|
| 276 |
)
|
|
@@ -434,4 +435,4 @@ if all_inputs_are_valid and len(R) > 0:
|
|
| 434 |
st.download_button(
|
| 435 |
"Download as CSV", csv, "predictions.csv", "text/csv", key="download-csv"
|
| 436 |
)
|
| 437 |
-
|
|
|
|
| 21 |
import requests
|
| 22 |
from io import BytesIO
|
| 23 |
import urllib.request
|
| 24 |
+
# import miniautoml
|
| 25 |
|
| 26 |
import warnings
|
| 27 |
warnings.filterwarnings('ignore')
|
|
|
|
| 48 |
#header {visibility: hidden;}
|
| 49 |
</style>
|
| 50 |
"""
|
| 51 |
+
st.markdown(hide_streamlit_style, unsafe_allow_html=True)
|
| 52 |
|
| 53 |
dataset = load_dataset('ligdis/data', data_files={"predictions.csv"})
|
| 54 |
+
df_predictions = dataset['train'].to_pandas()
|
| 55 |
|
| 56 |
predictions_inchikeys = df_predictions["inchikey"].tolist()
|
| 57 |
df_predictions = df_predictions.rename(columns={"inchikey": "InChIKey"})
|
| 58 |
|
| 59 |
dataset = load_dataset('ligdis/data', data_files={"applicability.csv"})
|
| 60 |
+
df_applicability = dataset['train'].to_pandas()
|
| 61 |
|
| 62 |
df_predictions = pd.concat([df_predictions, df_applicability], axis=1)
|
| 63 |
|
| 64 |
dataset = load_dataset('ligdis/data', data_files={"cemm_smiles.csv"})
|
| 65 |
+
cemm_smiles = dataset['train'].to_pandas()
|
| 66 |
|
| 67 |
fid2smi = {}
|
| 68 |
for r in cemm_smiles.values:
|
|
|
|
| 75 |
CRF_PATTERN_1 = "N=N"
|
| 76 |
|
| 77 |
dataset = load_dataset('ligdis/data', data_files={"all_fff_enamine.csv"})
|
| 78 |
+
enamine_catalog = dataset['train'].to_pandas()
|
| 79 |
enamine_catalog_ids_set = set(enamine_catalog["catalog_id"])
|
| 80 |
enamine_catalog_dict = {}
|
| 81 |
catalog2inchikey = {}
|
|
|
|
| 131 |
return True
|
| 132 |
|
| 133 |
dataset = load_dataset('ligdis/data', data_files={"model_catalog.csv"})
|
| 134 |
+
dm = dataset['train'].to_pandas()
|
| 135 |
all_models = dm["model_name"].tolist()
|
| 136 |
|
| 137 |
dataset = load_dataset('ligdis/data', data_files={"models_performance.tsv"})
|
| 138 |
+
dp = dataset['train'].to_pandas()
|
| 139 |
|
| 140 |
model_display = {}
|
| 141 |
model_description = {}
|
|
|
|
| 150 |
sign_models = [x for x in dm["model_name"].tolist() if x.startswith("signature")]
|
| 151 |
|
| 152 |
global_promiscuity_models = ["promiscuity_pxf0", "promiscuity_pxf1", "promiscuity_pxf2"]
|
| 153 |
+
specific_promiscuity_models = ["promiscuity_fxp0_pxf0", "promiscuity_fxp1_pxf0","promiscuity_fxp2_pxf0", "promiscuity_fxp0_pxf1", "promiscuity_fxp1_pxf1", "promiscuity_fxp2_pxf1", "promiscuity_fxp0_pxf2", "promiscuity_fxp1_pxf2", "promiscuity_fxp2_pxf2"]
|
| 154 |
+
|
| 155 |
def model_to_markdown(model_names):
|
| 156 |
items = []
|
| 157 |
for mn in model_names:
|
|
|
|
| 241 |
|
| 242 |
st.markdown(
|
| 243 |
"""
|
| 244 |
+
Explanation for Output: The results are displayed in 4 Columns.
|
| 245 |
+
1. **Structure** of the FFF, InChI, Enamine ID
|
| 246 |
2. **Chemical space**: Displays the Molecular Weight (*MW*), Wildman-Crippen *LogP* and Tanimoto Similarity to the most similar fragment (*Sim-1*) and third most similar fragment (*Sim-3*) in the training set
|
| 247 |
3. **Promiscuity Predictions** based on 12 Models: 3 Global (section **A**) and 9 Specific (section **B**)
|
| 248 |
4. **Ontology Predictions** based on 9 _Signature_ Models derived from protein annotations of multiple scopes - from domains and families to molecular functions and cellular localization
|
|
|
|
| 250 |
)
|
| 251 |
|
| 252 |
myCol = st.columns(3)
|
| 253 |
+
|
| 254 |
with myCol[0]:
|
| 255 |
st.subheader("Promiscuity Predictions")
|
| 256 |
st.markdown("**A. Global models**")
|
|
|
|
| 271 |
st.markdown(
|
| 272 |
"""
|
| 273 |
- Model score (range 0 -> 1) corresponds to the mean AUROC in 10 train-test splits
|
| 274 |
+
- Percentages in parentheses denote the percentile of the score across the Enamine collection of FFFs (>250k compounds). For example, in "Sign-4: 0.02 (35.7%)", **35.7** is the percentile of the score.
|
| 275 |
- The exclamation sign (!) next to the prediction output indicates that the corresponding model has an AUROC accuracy below 0.7 (*! is a warning sign*)
|
| 276 |
"""
|
| 277 |
)
|
|
|
|
| 435 |
st.download_button(
|
| 436 |
"Download as CSV", csv, "predictions.csv", "text/csv", key="download-csv"
|
| 437 |
)
|
| 438 |
+
|