Update app.py
Browse files
app.py
CHANGED
|
@@ -16,6 +16,7 @@ from rdkit.Chem import AllChem
|
|
| 16 |
from rdkit import DataStructs
|
| 17 |
from rdkit.Chem import Descriptors
|
| 18 |
from scipy import stats
|
|
|
|
| 19 |
from datasets import load_dataset
|
| 20 |
import requests
|
| 21 |
from io import BytesIO
|
|
@@ -25,12 +26,29 @@ import warnings
|
|
| 25 |
warnings.filterwarnings('ignore')
|
| 26 |
|
| 27 |
st.set_page_config(
|
| 28 |
-
page_title="Fragment
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
|
| 32 |
)
|
| 33 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 34 |
dataset = load_dataset('ligdis/data', data_files={"predictions.csv"})
|
| 35 |
df_predictions = dataset['train'].to_pandas()
|
| 36 |
|
|
@@ -111,9 +129,6 @@ def has_crf(mol):
|
|
| 111 |
return False
|
| 112 |
return True
|
| 113 |
|
| 114 |
-
|
| 115 |
-
st.title("Fully-functionalized fragment predictions")
|
| 116 |
-
|
| 117 |
dataset = load_dataset('ligdis/data', data_files={"model_catalog.csv"})
|
| 118 |
dm = dataset['train'].to_pandas()
|
| 119 |
all_models = dm["model_name"].tolist()
|
|
@@ -133,6 +148,9 @@ for r in dp.values:
|
|
| 133 |
prom_models = [x for x in dm["model_name"].tolist() if x.startswith("promiscuity")]
|
| 134 |
sign_models = [x for x in dm["model_name"].tolist() if x.startswith("signature")]
|
| 135 |
|
|
|
|
|
|
|
|
|
|
| 136 |
def model_to_markdown(model_names):
|
| 137 |
items = []
|
| 138 |
for mn in model_names:
|
|
@@ -144,52 +162,7 @@ def model_to_markdown(model_names):
|
|
| 144 |
markdown_list = "\n".join(items)
|
| 145 |
return markdown_list
|
| 146 |
|
| 147 |
-
st.sidebar.title("
|
| 148 |
-
|
| 149 |
-
st.sidebar.markdown("**Global models**")
|
| 150 |
-
|
| 151 |
-
global_promiscuity_models = ["promiscuity_pxf0", "promiscuity_pxf1", "promiscuity_pxf2"]
|
| 152 |
-
st.sidebar.text(model_to_markdown(global_promiscuity_models))
|
| 153 |
-
|
| 154 |
-
st.sidebar.markdown("**Specific models**")
|
| 155 |
-
|
| 156 |
-
specific_promiscuity_models = [
|
| 157 |
-
"promiscuity_fxp0_pxf0",
|
| 158 |
-
"promiscuity_fxp1_pxf0",
|
| 159 |
-
"promiscuity_fxp2_pxf0",
|
| 160 |
-
"promiscuity_fxp0_pxf1",
|
| 161 |
-
"promiscuity_fxp1_pxf1",
|
| 162 |
-
"promiscuity_fxp2_pxf1",
|
| 163 |
-
"promiscuity_fxp0_pxf2",
|
| 164 |
-
"promiscuity_fxp1_pxf2",
|
| 165 |
-
"promiscuity_fxp2_pxf2",
|
| 166 |
-
]
|
| 167 |
-
st.sidebar.text(model_to_markdown(specific_promiscuity_models))
|
| 168 |
-
|
| 169 |
-
st.sidebar.markdown("**Aggregated score**")
|
| 170 |
-
st.sidebar.text("Sum : Sum of individual promiscuity predictors.")
|
| 171 |
-
|
| 172 |
-
st.sidebar.title("Signature models")
|
| 173 |
-
signature_models = ["signature_{0}".format(i) for i in range(10)]
|
| 174 |
-
st.sidebar.text(model_to_markdown(signature_models))
|
| 175 |
-
|
| 176 |
-
st.sidebar.title("Chemical space")
|
| 177 |
-
s = ["MW : Molecular weight.",
|
| 178 |
-
"LogP : Walden-Crippen LogP.",
|
| 179 |
-
"Sim-1 : Tanimoto similarity to the most ",
|
| 180 |
-
" similar fragment in the training set.",
|
| 181 |
-
"Sim-3 : Tanimoto similarity to the third ",
|
| 182 |
-
" most similar fragment in the training set."]
|
| 183 |
-
|
| 184 |
-
st.sidebar.text("\n".join(s))
|
| 185 |
-
|
| 186 |
-
st.sidebar.text("* The score in parenthesis corresponds to the mean AUROC in 10 train-test splits")
|
| 187 |
-
|
| 188 |
-
st.sidebar.markdown("**In the main page...**")
|
| 189 |
-
s = textwrap.wrap("1. Percentages in parenthesis denote the percentile of the score across the Enamine collection of FFFs (>250k compounds)", width=60)
|
| 190 |
-
st.sidebar.text("\n".join(s))
|
| 191 |
-
s = textwrap.wrap("2. The exclamation sign (!) indicates that the corresponding model has an AUROC accuracy below 0.7.", width=60)
|
| 192 |
-
st.sidebar.text("\n".join(s))
|
| 193 |
|
| 194 |
placeholder_text = []
|
| 195 |
keys = random.sample(sorted(enamine_catalog_ids_set), 5)
|
|
@@ -197,11 +170,29 @@ for k in keys:
|
|
| 197 |
placeholder_text += [random.choice([k, enamine_catalog_dict[k]])]
|
| 198 |
placeholder_text = "\n".join(placeholder_text)
|
| 199 |
|
| 200 |
-
text_input = st.text_area(label="Input your fully functionalized fragments:")
|
| 201 |
inputs = [x.strip(" ") for x in text_input.split("\n")]
|
| 202 |
inputs = [x for x in inputs if x != ""]
|
| 203 |
if len(inputs) > 999:
|
| 204 |
-
st.error("Please limit the number of input fragments to 999.")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 205 |
|
| 206 |
R = []
|
| 207 |
all_inputs_are_valid = True
|
|
@@ -241,13 +232,50 @@ for i, inp in enumerate(inputs):
|
|
| 241 |
all_inputs_are_valid = False
|
| 242 |
R += [r]
|
| 243 |
|
| 244 |
-
|
| 245 |
def get_fragment_image(smiles):
    """Render a 2D depiction of the given SMILES string.

    Parameters
    ----------
    smiles : str
        SMILES string of the fragment; assumed to be valid (callers
        validate inputs upstream — MolFromSmiles returns None otherwise).

    Returns
    -------
    PIL.Image.Image
        A 200x200 rendering of the molecule.
    """
    mol = Chem.MolFromSmiles(smiles)
    # Generate 2D coordinates in place so the depiction has a sensible layout.
    AllChem.Compute2DCoords(mol)
    return Draw.MolToImage(mol, size=(200, 200))
|
| 250 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 251 |
if all_inputs_are_valid and len(R) > 0:
|
| 252 |
sum_of_promiscuities = np.sum(
|
| 253 |
df_predictions[global_promiscuity_models + specific_promiscuity_models], axis=1
|
|
@@ -375,7 +403,6 @@ if all_inputs_are_valid and len(R) > 0:
|
|
| 375 |
cols[2].markdown("**Promiscuity**")
|
| 376 |
sum_prom = np.sum(v[prom_columns])
|
| 377 |
perc_prom = stats.percentileofscore(sum_of_promiscuities, sum_prom)
|
| 378 |
-
cols[2].text("Sum : {0:.2f} ({1:.1f}%)".format(sum_prom, perc_prom))
|
| 379 |
my_cols = ["Prom-0", "Prom-1", "Prom-2"]
|
| 380 |
cols[2].text(score_texts(v[my_cols], my_cols))
|
| 381 |
|
|
@@ -392,9 +419,12 @@ if all_inputs_are_valid and len(R) > 0:
|
|
| 392 |
]
|
| 393 |
cols[2].text(score_texts(v[my_cols], my_cols))
|
| 394 |
|
|
|
|
|
|
|
| 395 |
cols[3].markdown("**Signatures**")
|
| 396 |
my_cols = ["Sign-{0}".format(i) for i in range(10)]
|
| 397 |
cols[3].text(score_texts(v[my_cols], my_cols))
|
|
|
|
| 398 |
|
| 399 |
def convert_df(df):
    """Serialize *df* to UTF-8 encoded CSV bytes (index column omitted).

    Used to feed st.download_button, which expects bytes.
    """
    csv_text = df.to_csv(index=False)
    return csv_text.encode("utf-8")
|
|
@@ -404,32 +434,4 @@ if all_inputs_are_valid and len(R) > 0:
|
|
| 404 |
st.download_button(
|
| 405 |
"Download as CSV", csv, "predictions.csv", "text/csv", key="download-csv"
|
| 406 |
)
|
| 407 |
-
|
| 408 |
-
else:
|
| 409 |
-
st.info(
|
| 410 |
-
"This tool expects fully functionalized fragments (FFF) as input, including the diazirine+alkyne probe (CRF). We have tailored the chemical space of the predictions to FFFs; the app will through an error if any of the input molecules does not contain a CRF region. Enamine provides a good [catalog](https://enamine.net/compound-libraries/fragment-libraries/fully-functionalized-probe-library) of FFFs. For a quick test input, use any of the options below."
|
| 411 |
-
)
|
| 412 |
-
|
| 413 |
-
example_0 = ["Z5645472552", "Z5645472643", "Z5645472785"]
|
| 414 |
-
st.markdown("**Input Enamine FFF identifiers...**")
|
| 415 |
-
st.text("\n".join(example_0))
|
| 416 |
-
|
| 417 |
-
example_1 = [
|
| 418 |
-
"C#CCCC1(CCCNC(=O)C(Cc2c[nH]c3ncccc23)NC(=O)OC(C)(C)C)N=N1",
|
| 419 |
-
"C#CCCC1(CCCNC(=O)[C@H]2CCC(=O)NC2)N=N1",
|
| 420 |
-
"C#CCCC1(CCCNC(=O)CSc2ncc(C(=O)OCC)c(N)n2)N=N1",
|
| 421 |
-
]
|
| 422 |
-
st.markdown("**Input FFF SMILES strings...**")
|
| 423 |
-
st.text("\n".join(example_1))
|
| 424 |
-
|
| 425 |
-
example_2 = ["C310", "C045", "C391"]
|
| 426 |
-
st.markdown("**Input Ligand Discovery identifiers...**")
|
| 427 |
-
st.text("\n".join(example_2))
|
| 428 |
-
|
| 429 |
-
example_3 = [
|
| 430 |
-
"Z5645486561",
|
| 431 |
-
"C#CCCCC1(CCCC(=O)N2CCC(C(C(=O)O)c3ccc(C)cc3)CC2)N=N1",
|
| 432 |
-
"C279",
|
| 433 |
-
]
|
| 434 |
-
st.markdown("**Input a mix of the above identifiers**")
|
| 435 |
-
st.text("\n".join(example_3))
|
|
|
|
| 16 |
from rdkit import DataStructs
|
| 17 |
from rdkit.Chem import Descriptors
|
| 18 |
from scipy import stats
|
| 19 |
+
import textwrap
|
| 20 |
from datasets import load_dataset
|
| 21 |
import requests
|
| 22 |
from io import BytesIO
|
|
|
|
| 26 |
warnings.filterwarnings('ignore')
|
| 27 |
|
| 28 |
st.set_page_config(
|
| 29 |
+
page_title="Ligand Discovery 4: Fragment Predictions",
|
| 30 |
+
page_icon=":home:",
|
| 31 |
+
layout="wide", # "centered",
|
| 32 |
+
initial_sidebar_state="expanded"
|
| 33 |
)
|
| 34 |
|
| 35 |
+
st.markdown("""
|
| 36 |
+
<style>
|
| 37 |
+
.css-13sdm1b.e16nr0p33 {
|
| 38 |
+
margin-top: -75px;
|
| 39 |
+
}
|
| 40 |
+
</style>
|
| 41 |
+
""", unsafe_allow_html=True)
|
| 42 |
+
|
| 43 |
+
hide_streamlit_style = """
|
| 44 |
+
<style>
|
| 45 |
+
#MainMenu {visibility: hidden;}
|
| 46 |
+
footer {visibility: hidden;}
|
| 47 |
+
#header {visibility: hidden;}
|
| 48 |
+
</style>
|
| 49 |
+
"""
|
| 50 |
+
st.markdown(hide_streamlit_style, unsafe_allow_html=True)
|
| 51 |
+
|
| 52 |
dataset = load_dataset('ligdis/data', data_files={"predictions.csv"})
|
| 53 |
df_predictions = dataset['train'].to_pandas()
|
| 54 |
|
|
|
|
| 129 |
return False
|
| 130 |
return True
|
| 131 |
|
|
|
|
|
|
|
|
|
|
| 132 |
dataset = load_dataset('ligdis/data', data_files={"model_catalog.csv"})
|
| 133 |
dm = dataset['train'].to_pandas()
|
| 134 |
all_models = dm["model_name"].tolist()
|
|
|
|
| 148 |
prom_models = [x for x in dm["model_name"].tolist() if x.startswith("promiscuity")]
|
| 149 |
sign_models = [x for x in dm["model_name"].tolist() if x.startswith("signature")]
|
| 150 |
|
| 151 |
+
global_promiscuity_models = ["promiscuity_pxf0", "promiscuity_pxf1", "promiscuity_pxf2"]
|
| 152 |
+
specific_promiscuity_models = ["promiscuity_fxp0_pxf0", "promiscuity_fxp1_pxf0","promiscuity_fxp2_pxf0", "promiscuity_fxp0_pxf1", "promiscuity_fxp1_pxf1", "promiscuity_fxp2_pxf1", "promiscuity_fxp0_pxf2", "promiscuity_fxp1_pxf2", "promiscuity_fxp2_pxf2"]
|
| 153 |
+
|
| 154 |
def model_to_markdown(model_names):
|
| 155 |
items = []
|
| 156 |
for mn in model_names:
|
|
|
|
| 162 |
markdown_list = "\n".join(items)
|
| 163 |
return markdown_list
|
| 164 |
|
| 165 |
+
st.sidebar.title("Ligand Discovery 4: Fragment Predictions")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 166 |
|
| 167 |
placeholder_text = []
|
| 168 |
keys = random.sample(sorted(enamine_catalog_ids_set), 5)
|
|
|
|
| 170 |
placeholder_text += [random.choice([k, enamine_catalog_dict[k]])]
|
| 171 |
placeholder_text = "\n".join(placeholder_text)
|
| 172 |
|
| 173 |
+
text_input = st.sidebar.text_area(label="Input your fully functionalized fragments:")
|
| 174 |
inputs = [x.strip(" ") for x in text_input.split("\n")]
|
| 175 |
inputs = [x for x in inputs if x != ""]
|
| 176 |
if len(inputs) > 999:
|
| 177 |
+
st.sidebar.error("Please limit the number of input fragments to 999.")
|
| 178 |
+
|
| 179 |
+
st.sidebar.info("This tool expects fully functionalized fragments (FFF) as input, including the diazirine+alkyne probe (CRF). We have tailored the chemical space of the predictions to FFFs; the app will through an error if any of the input molecules does not contain a CRF region. Enamine provides a good [catalog](https://enamine.net/compound-libraries/fragment-libraries/fully-functionalized-probe-library) of FFFs. For a quick test input, use any of the options below")
|
| 180 |
+
|
| 181 |
+
example_0 = ["Z5645472552", "Z5645472643", "Z5645472785"]
|
| 182 |
+
st.sidebar.markdown("**Input Enamine FFF identifiers...**")
|
| 183 |
+
st.sidebar.text("\n".join(example_0))
|
| 184 |
+
|
| 185 |
+
example_1 = ["C#CCCC1(CCCNC(=O)C(Cc2c[nH]c3ncccc23)NC(=O)OC(C)(C)C)N=N1", "C#CCCC1(CCCNC(=O)[C@H]2CCC(=O)NC2)N=N1", "C#CCCC1(CCCNC(=O)CSc2ncc(C(=O)OCC)c(N)n2)N=N1"]
|
| 186 |
+
st.sidebar.markdown("**Input FFF SMILES strings...**")
|
| 187 |
+
st.sidebar.text("\n".join(example_1))
|
| 188 |
+
|
| 189 |
+
example_2 = ["C310", "C045", "C391"]
|
| 190 |
+
st.sidebar.markdown("**Input Ligand Discovery identifiers...**")
|
| 191 |
+
st.sidebar.text("\n".join(example_2))
|
| 192 |
+
|
| 193 |
+
example_3 = ["Z5645486561", "C#CCCCC1(CCCC(=O)N2CCC(C(C(=O)O)c3ccc(C)cc3)CC2)N=N1", "C279"]
|
| 194 |
+
st.sidebar.markdown("**Input a mix of the above identifiers**")
|
| 195 |
+
st.sidebar.text("\n".join(example_3))
|
| 196 |
|
| 197 |
R = []
|
| 198 |
all_inputs_are_valid = True
|
|
|
|
| 232 |
all_inputs_are_valid = False
|
| 233 |
R += [r]
|
| 234 |
|
|
|
|
| 235 |
def get_fragment_image(smiles):
    """Return a 200x200 PIL image depicting the fragment *smiles*.

    2D coordinates are computed before drawing so RDKit produces a
    readable layout. Assumes *smiles* parses to a valid molecule
    (inputs are validated before this is called).
    """
    molecule = Chem.MolFromSmiles(smiles)
    AllChem.Compute2DCoords(molecule)
    image = Draw.MolToImage(molecule, size=(200, 200))
    return image
|
| 240 |
|
| 241 |
+
st.markdown(
|
| 242 |
+
"""
|
| 243 |
+
Explanation for Output: The results are displayed in 4 Columns.
|
| 244 |
+
1. **Structure** of the FFF, InChi, Enamine ID
|
| 245 |
+
2. **Chemical space**: Displays the Molecular Weight (*MW*), Walden-Crippen *LogP* and Tanimoto Similarity to the most similar fragment (*Sim-1*) and third most similar fragment (*Sim-3*) in the training set
|
| 246 |
+
3. **Promiscuity Predictions** based on 12 Model: 3 Global (section **A**) and 9 Specific (section **B**)
|
| 247 |
+
4. **Ontology Predictions** based on 9 _Signature_ Models derived from protein annotations of multiple scopes - from domains and families to molecular functions and cellular localization
|
| 248 |
+
"""
|
| 249 |
+
)
|
| 250 |
+
|
| 251 |
+
myCol = st.columns(3)
|
| 252 |
+
|
| 253 |
+
with myCol[0]:
|
| 254 |
+
st.subheader("Promiscuity Predictions")
|
| 255 |
+
st.markdown("**A. Global models**")
|
| 256 |
+
st.text(model_to_markdown(global_promiscuity_models))
|
| 257 |
+
st.markdown("**C. Aggregated score**")
|
| 258 |
+
st.text("Sum : Sum of individual promiscuity predictors")
|
| 259 |
+
with myCol[1]:
|
| 260 |
+
st.text("")
|
| 261 |
+
st.text("")
|
| 262 |
+
st.markdown("**B. Specific models**")
|
| 263 |
+
st.text(model_to_markdown(specific_promiscuity_models))
|
| 264 |
+
|
| 265 |
+
with myCol[2]:
|
| 266 |
+
st.subheader("Ontology Predictions")
|
| 267 |
+
signature_models = ["signature_{0}".format(i) for i in range(10)]
|
| 268 |
+
st.text(model_to_markdown(signature_models))
|
| 269 |
+
|
| 270 |
+
st.markdown(
|
| 271 |
+
"""
|
| 272 |
+
- Model score (range 0 -> 1) corresponds to the mean AUROC in 10 train-test splits
|
| 273 |
+
- Percentages in parenthesis denote the percentile of the score across the Enamine collection of FFFs (>250k compounds). for example, in "Sign-4: 0.02 (35.7%)", **35.7** is the percentile of score.
|
| 274 |
+
- The exclamation sign (!) next to the prediction output indicates that the corresponding model has an AUROC accuracy below 0.7 (*! is a warning sign*)
|
| 275 |
+
"""
|
| 276 |
+
)
|
| 277 |
+
st.divider()
|
| 278 |
+
|
| 279 |
if all_inputs_are_valid and len(R) > 0:
|
| 280 |
sum_of_promiscuities = np.sum(
|
| 281 |
df_predictions[global_promiscuity_models + specific_promiscuity_models], axis=1
|
|
|
|
| 403 |
cols[2].markdown("**Promiscuity**")
|
| 404 |
sum_prom = np.sum(v[prom_columns])
|
| 405 |
perc_prom = stats.percentileofscore(sum_of_promiscuities, sum_prom)
|
|
|
|
| 406 |
my_cols = ["Prom-0", "Prom-1", "Prom-2"]
|
| 407 |
cols[2].text(score_texts(v[my_cols], my_cols))
|
| 408 |
|
|
|
|
| 419 |
]
|
| 420 |
cols[2].text(score_texts(v[my_cols], my_cols))
|
| 421 |
|
| 422 |
+
cols[2].text("Sum : {0:.2f} ({1:.1f}%)".format(sum_prom, perc_prom))
|
| 423 |
+
|
| 424 |
cols[3].markdown("**Signatures**")
|
| 425 |
my_cols = ["Sign-{0}".format(i) for i in range(10)]
|
| 426 |
cols[3].text(score_texts(v[my_cols], my_cols))
|
| 427 |
+
st.divider()
|
| 428 |
|
| 429 |
def convert_df(df):
    """Return the prediction table as UTF-8 CSV bytes, without the index.

    The bytes form is what st.download_button requires for its payload.
    """
    as_text = df.to_csv(index=False)
    return bytes(as_text, "utf-8")
|
|
|
|
| 434 |
st.download_button(
|
| 435 |
"Download as CSV", csv, "predictions.csv", "text/csv", key="download-csv"
|
| 436 |
)
|
| 437 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|