Spaces:

QCDevs
/

b3clf

Running

App Files Files Community

legend1234 commited on Jan 19, 2025

Commit

ec17199

1 Parent(s): 77abf89

Add working webserver

Browse files

Files changed (7) hide show

README.md +2 -2
app.py +404 -0
packages.txt +1 -0
requirements.txt +17 -0
sample_input.sdf +387 -0
sample_input_smiles.csv +6 -0
utils.py +172 -0

README.md CHANGED Viewed

@@ -1,8 +1,8 @@
 ---
 title: B3lcf
 emoji: 🏆
-colorFrom: yellow
-colorTo: green
 sdk: streamlit
 sdk_version: 1.41.1
 app_file: app.py

 ---
 title: B3lcf
 emoji: 🏆
+colorFrom: blue
+colorTo: pink
 sdk: streamlit
 sdk_version: 1.41.1
 app_file: app.py

app.py ADDED Viewed

	@@ -0,0 +1,404 @@

+import itertools as it
+import os
+import tempfile
+from io import StringIO
+import joblib
+import numpy as np
+import pandas as pd
+import pkg_resources
+# page set up
+import streamlit as st
+from b3clf.descriptor_padel import compute_descriptors
+from b3clf.geometry_opt import geometry_optimize
+from b3clf.utils import get_descriptors, scale_descriptors, select_descriptors
+# from PIL import Image
+from streamlit_extras.let_it_rain import rain
+from streamlit_ketcher import st_ketcher
+from utils import generate_predictions, load_all_models
+# Clear every st.cache_data entry. As a top-level statement this executes each time the script runs.
+# NOTE(review): this presumably keeps any st.cache_data cache from surviving a rerun — confirm it is intended.
+st.cache_data.clear()
+# Page-level configuration: browser tab title and wide layout.
+st.set_page_config(
+    page_title="BBB Permeability Prediction with Imbalanced Learning",
+    # page_icon="🧊",
+    layout="wide",
+    # initial_sidebar_state="expanded",
+    # menu_items={
+    #     "Get Help": "https://www.extremelycoolapp.com/help",
+    #     "Report a bug": "https://www.extremelycoolapp.com/bug",
+    #     "About": "# This is a header. This is an *extremely* cool app!"
+    # }
+)
+# NOTE(review): keep_features / keep_sdf are not referenced in the visible part of this script;
+# possibly leftovers from another entry point — confirm before removing.
+keep_features = "no"
+keep_sdf = "no"
+# Maps the classifier label shown in the UI to the short code passed as `clf` to generate_predictions.
+classifiers_dict = {
+    "decision tree": "dtree",
+    "kNN": "knn",
+    "logistic regression": "logreg",
+    "XGBoost": "xgb",
+}
+# Maps the resampling label shown in the UI to the code passed as `sampling` to generate_predictions.
+resample_methods_dict = {
+    "random undersampling": "classic_RandUndersampling",
+    "SMOTE": "classic_SMOTE",
+    "Borderline SMOTE": "borderline_SMOTE",
+    "k-means SMOTE": "kmeans_SMOTE",
+    "ADASYN": "classic_ADASYN",
+    "no resampling": "common",
+}
+# "line_limit" caps how many rows of the feature / prediction tables are rendered on the page.
+pandas_display_options = {
+    "line_limit": 50,
+}
+# Module-level placeholders; they are reassigned when a job is submitted.
+mol_features = None
+info_df = None
+results = None
+temp_file_path = None
+# Models returned by load_all_models (imported from utils); passed to generate_predictions as `_models_dict`.
+all_models = load_all_models()
+# Initialize global variables and cleanup function
+if 'temp_dir' not in st.session_state:
+    st.session_state.temp_dir = None
+if 'processing' not in st.session_state:
+    st.session_state.processing = False
+def cleanup_temp_files():
+    """Clean up temporary directory and files"""
+    if st.session_state.temp_dir and os.path.exists(st.session_state.temp_dir):
+        try:
+            import shutil
+            shutil.rmtree(st.session_state.temp_dir)
+            st.session_state.temp_dir = None
+        except Exception as e:
+            st.error(f"Error cleaning up temporary files: {e}")
+def clear_cache():
+    """Clear Streamlit cache and session state data"""
+    st.cache_data.clear()
+    st.cache_resource.clear()
+    if 'mol_features' in st.session_state:
+        st.session_state.mol_features = None
+    if 'info_df' in st.session_state:
+        st.session_state.info_df = None
+    cleanup_temp_files()
+# Create the Streamlit app
+st.title(":blue[BBB Permeability Prediction with Imbalanced Learning]")
+info_column, upload_column = st.columns(2)
+# inatialize the molecule features and info dataframe session state
+if "mol_features" not in st.session_state:
+    st.session_state.mol_features = None
+if "info_df" not in st.session_state:
+    st.session_state.info_df = None
+if "classifier" not in st.session_state:
+    st.session_state.classifier = "XGBoost"
+if "resampler" not in st.session_state:
+    st.session_state.resampler = "ADASYN"
+if "historical_data" not in st.session_state:
+    st.session_state.historical_data = []
+# download sample files
+with info_column:
+    st.subheader("About `B3clf`")
+    # fmt: off
+    st.markdown(
+        """
+        `B3clf` is a Python package for predicting the blood-brain barrier (BBB) permeability of small molecules using imbalanced learning. It supports decision tree, XGBoost, kNN, logistical regression and 5 resampling strategies (SMOTE, Borderline SMOTE, k-means SMOTE and ADASYN). The workflow of `B3clf` is summarized as below. The Source code and more details are available at https://github.com/theochem/B3clf. This project is supported by Digital Research Alliance of Canada (originally known as Compute Canada) and NSERC. This project is maintained by QC-Dev comminity. For further information and inquiries please contact us at qcdevs@gmail.com."""
+    )
+    st.text(" \n")
+    # text_body = """
+    # `B3clf` is a Python package for predicting the blood-brain barrier (BBB) permeability of small molecules using imbalanced learning. It supports decision tree, XGBoost, kNN, logistical regression and 5 resampling strategies (SMOTE, Borderline SMOTE, k-means SMOTE and ADASYN). The workflow of `B3clf` is summarized as below. The Source code and more details are available at https://github.com/theochem/B3clf.
+    # """
+    # st.markdown(f"<p align="justify">{text_body}</p>",
+    #             unsafe_allow_html=True)
+    # image = Image.open("images/b3clf_workflow.png")
+    # st.image(image=image, use_column_width=True)
+    # image_path = "images/b3clf_workflow.png"
+    # image_width_percent = 80
+    # info_column.markdown(
+    #     f"<img src="{image_path}" style="max-width: {image_width_percent}%; height: auto;">",
+    #     unsafe_allow_html=True
+    #     )
+    # fmt: on
+    sdf_col, smi_col = st.columns(2)
+    with sdf_col:
+        # uneven columns
+        # st.columns((2, 1, 1, 1))
+        # two subcolumns for sample input files
+        # download sample sdf
+        # st.markdown(" \n \n")
+        with open("sample_input.sdf", "r") as file_sdf:
+            btn = st.download_button(
+                label="Download SDF sample file",
+                data=file_sdf,
+                file_name="sample_input.sdf",
+            )
+    with smi_col:
+        with open("sample_input_smiles.csv", "r") as file_smi:
+            btn = st.download_button(
+                label="Download SMILES sample file",
+                data=file_smi,
+                file_name="sample_input_smiles.csv",
+            )
+# Create a file uploader
+with upload_column:
+    st.subheader("Model Selection")
+    with st.container():
+        algorithm_col, resampler_col = st.columns(2)
+        # algorithm and resampling method selection column
+        with algorithm_col:
+            classifier = st.selectbox(
+                label="Classification Algorithm:",
+                options=("XGBoost", "kNN", "decision tree", "logistic regression"),
+                key="classifier",
+                help="Select the classification algorithm to use"
+            )
+        with resampler_col:
+            resampler = st.selectbox(
+                label="Resampling Method:",
+                options=(
+                    "ADASYN",
+                    "random undersampling",
+                    "Borderline SMOTE",
+                    "k-means SMOTE",
+                    "SMOTE",
+                    "no resampling",
+                ),
+                key="resampler",
+                help="Select the resampling method to handle imbalanced data"
+            )
+        # Update session state based on selections
+        if "classifier" not in st.session_state:
+            st.session_state.classifier = classifier
+        if "resampler" not in st.session_state:
+            st.session_state.resampler = resampler
+        # horizontal line
+        st.divider()
+        # upload_col, submit_job_col = st.columns((2, 1))
+        upload_col, _, submit_job_col, _ = st.columns((4, 0.05, 1, 0.05))
+        # upload file column
+        with upload_col:
+            # session state tracking of the file uploader
+            if "uploaded_file" not in st.session_state:
+                st.session_state.uploaded_file = None
+            if "uploaded_file_changed" not in st.session_state:
+                st.session_state.uploaded_file_changed = False
+            # def update_uploader_session_info():
+            #     """Update the session state of the file uploader."""
+            #     st.session_state.uploaded_file = uploaded_file
+            uploaded_file = st.file_uploader(
+                label="Upload a CSV, SDF, TXT or SMI file",
+                type=["csv", "sdf", "txt", "smi"],
+                help="Input molecule file only supports *.csv, *.sdf, *.txt and *.smi.",
+                accept_multiple_files=False,
+                # key="uploaded_file",
+                # on_change=update_uploader_session_info,
+            )
+            if uploaded_file:
+                # st.write(f"the uploaded file: {uploaded_file}")
+                # when new file is uploaded is different from the previous one
+                if st.session_state.uploaded_file != uploaded_file:
+                    st.session_state.uploaded_file_changed = True
+                else:
+                    st.session_state.uploaded_file_changed = False
+                st.session_state.uploaded_file = uploaded_file
+                # when new file is the same as the previous one
+                # else:
+                #     st.session_state.uploaded_file_changed = False
+                # st.session_state.uploaded_file = uploaded_file
+            # set session state for the file uploader
+            # st.write(f"the state of uploaded file: {st.session_state.uploaded_file}")
+            # st.write(f"the state of uploaded file changed: {st.session_state.uploaded_file_changed}")
+        # submit job column
+        with submit_job_col:
+            st.text(" \n")
+            st.text(" \n")
+            st.markdown(
+                "<div style='display: flex; justify-content: center;'>",
+                unsafe_allow_html=True,
+            )
+            submit_job_button = st.button(
+                label="Submit Job",
+                type="secondary",
+                key="job_button",
+                help="Click to start calculations with current configuration"
+            )
+        if not submit_job_button:
+            if "results" in locals():
+                del results
+            if "mol_features" in locals():
+                del mol_features
+            if "info_df" in locals():
+                del info_df
+# Display sections
+feature_column, prediction_column = st.columns(2)
+with feature_column:
+    st.subheader("Molecular Features")
+    placeholder_features = st.empty()
+with prediction_column:
+    st.subheader("Predictions")
+# Only process when Submit Job is clicked
+if submit_job_button:
+    if not uploaded_file and not st.session_state.mol_features:
+        st.warning("Please upload a file first or select data from history to process.")
+    else:
+        if st.session_state.processing:
+            st.warning("A job is already running. Please wait for it to complete.")
+        else:
+            try:
+                st.session_state.processing = True
+                with st.spinner('Processing... Please wait.'):
+                    # Clean up previous files and cache
+                    cleanup_temp_files()
+                    clear_cache()
+                    # Case 1: New file uploaded
+                    if uploaded_file:
+                        # Create new temporary directory
+                        st.session_state.temp_dir = tempfile.mkdtemp()
+                        temp_file_path = os.path.join(st.session_state.temp_dir, uploaded_file.name)
+                        with open(temp_file_path, "wb") as temp_file:
+                            temp_file.write(uploaded_file.read())
+                        # Store current data in history before processing new data
+                        if st.session_state.mol_features is not None and st.session_state.info_df is not None:
+                            st.session_state.historical_data.append({
+                                'mol_features': st.session_state.mol_features.copy(),
+                                'info_df': st.session_state.info_df.copy()
+                            })
+                        # Clear current data
+                        st.session_state.mol_features = None
+                        st.session_state.info_df = None
+                        try:
+                            mol_features, info_df, results = generate_predictions(
+                                input_fname=temp_file_path,
+                                sep="\s+|\t+",
+                                clf=classifiers_dict[st.session_state.classifier],
+                                _models_dict=all_models,
+                                sampling=resample_methods_dict[st.session_state.resampler],
+                                time_per_mol=120,
+                                mol_features=None,
+                                info_df=None,
+                            )
+                        finally:
+                            # Clean up temporary files after processing
+                            cleanup_temp_files()
+                    # Case 2: Recalculate with existing data
+                    else:
+                        mol_features, info_df, results = generate_predictions(
+                            input_fname=None,
+                            sep="\s+|\t+",
+                            clf=classifiers_dict[st.session_state.classifier],
+                            _models_dict=all_models,
+                            sampling=resample_methods_dict[st.session_state.resampler],
+                            time_per_mol=120,
+                            mol_features=st.session_state.mol_features,
+                            info_df=st.session_state.info_df,
+                        )
+                    # Update session state with new results
+                    if mol_features is not None and info_df is not None:
+                        st.session_state.mol_features = mol_features
+                        st.session_state.info_df = info_df
+            except Exception as e:
+                st.error(f"Error during processing: {str(e)}")
+            finally:
+                st.session_state.processing = False
+            # Display results
+            # feture table
+            with feature_column:
+                if st.session_state.mol_features is not None:
+                    selected_feature_rows = np.min(
+                        [st.session_state.mol_features.shape[0], pandas_display_options["line_limit"]]
+                    )
+                    st.dataframe(st.session_state.mol_features.iloc[:selected_feature_rows, :], hide_index=False)
+                    # placeholder_features.dataframe(mol_features, hide_index=False)
+                    feature_file_name = uploaded_file.name.split(".")[0] + "_b3clf_features.csv"
+                    features_csv = st.session_state.mol_features.to_csv(index=True)
+                    st.download_button(
+                        "Download features as CSV",
+                        data=features_csv,
+                        file_name=feature_file_name,
+                    )
+            # prediction table
+            with prediction_column:
+                # st.subheader("Predictions")
+                if results is not None:
+                    # Display the predictions in a table
+                    selected_result_rows = np.min(
+                        [results.shape[0], pandas_display_options["line_limit"]]
+                    )
+                    results_df_display = results.iloc[:selected_result_rows, :].style.format(
+                        {"B3clf_predicted_probability": "{:.6f}".format}
+                    )
+                    st.dataframe(results_df_display, hide_index=True)
+                    # Add a button to download the predictions as a CSV file
+                    predictions_csv = results.to_csv(index=True)
+                    results_file_name = (
+                        uploaded_file.name.split(".")[0] + "_b3clf_predictions.csv"
+                    )
+                    st.download_button(
+                        "Download predictions as CSV",
+                        data=predictions_csv,
+                        file_name=results_file_name,
+                    )
+                    # indicate the success of the job
+                    # rain(
+                    #     emoji="🎈",
+                    #     font_size=54,
+                    #     falling_speed=5,
+                    #     animation_length=10,
+                    # )
+            st.balloons()
+# hide footer
+# https://github.com/streamlit/streamlit/issues/892
+# CSS injected through st.markdown that sets the main menu and the footer to hidden.
+hide_streamlit_style = """
+            <style>
+            #MainMenu {visibility: hidden;}
+            footer {visibility: hidden;}
+            </style>
+            """
+st.markdown(hide_streamlit_style, unsafe_allow_html=True)
+# add google analytics
+# NOTE(review): the tag is injected as raw HTML via st.markdown; whether the
+# <script> elements are actually executed by the browser should be confirmed.
+st.markdown(
+    """
+    <!-- Google tag (gtag.js) -->
+    <script async src="https://www.googletagmanager.com/gtag/js?id=G-WG8QYRELP9"></script>
+    <script>
+      window.dataLayer = window.dataLayer || [];
+      function gtag(){dataLayer.push(arguments);}
+      gtag("js", new Date());
+      gtag("config", "G-WG8QYRELP9");
+    </script>
+    """,
+    unsafe_allow_html=True,
+)

packages.txt ADDED Viewed

	@@ -0,0 +1 @@


1	+ default-jre

requirements.txt ADDED Viewed

	@@ -0,0 +1,17 @@

+numpy==1.24.4
+scipy==1.10.1
+scikit-learn==0.24.2
+joblib==1.3.2
+pandas==2.0.3
+openpyxl==3.1.2
+xgboost==1.4.2
+padelpy>=0.1.11
+rdkit==2023.03.3
+# streamlit-extra==0.3.4
+git+https://github.com/arnaudmiribel/streamlit-extras@v0.3.4
+# for visualization
+streamlit-ketcher
+# for single molecule
+# py3Dmol==2.0.0.post2
+# stmol==0.0.9
+git+https://github.com/theochem/B3clf.git

sample_input.sdf ADDED Viewed

	@@ -0,0 +1,387 @@

+H1_Bepotastine
+     RDKit          3D
+ 52 54  0  0  1  0  0  0  0  0999 V2000
+    6.2601    3.8627   -0.7580 Cl  0  0  0  0  0  0  0  0  0  0  0  0
+    0.7350    0.2169   -0.1032 O   0  0  0  0  0  0  0  0  0  0  0  0
+   -7.2627    2.0029   -1.7812 O   0  0  0  0  0  0  0  0  0  0  0  0
+   -7.8739   -0.0429   -1.1421 O   0  0  0  0  0  0  0  0  0  0  0  0
+   -3.2826    0.1387    1.0997 N   0  0  0  0  0  0  0  0  0  0  0  0
+    2.0420   -2.0119   -1.2138 N   0  0  0  0  0  0  0  0  0  0  0  0
+   -0.4341   -0.2713    0.5552 C   0  0  0  0  0  0  0  0  0  0  0  0
+   -1.5088   -0.5144   -0.4974 C   0  0  0  0  0  0  0  0  0  0  0  0
+   -0.9255    0.7694    1.5572 C   0  0  0  0  0  0  0  0  0  0  0  0
+   -2.8345   -0.8975    0.1550 C   0  0  0  0  0  0  0  0  0  0  0  0
+   -2.2740    0.3674    2.1479 C   0  0  0  0  0  0  0  0  0  0  0  0
+   -4.5811   -0.1850    1.7144 C   0  0  0  0  0  0  0  0  0  0  0  0
+   -5.7574   -0.2607    0.7330 C   0  0  0  0  0  0  0  0  0  0  0  0
+    1.9672   -0.2099    0.5040 C   0  0  2  0  0  0  0  0  0  0  0  0
+   -5.9298    1.0111   -0.0974 C   0  0  0  0  0  0  0  0  0  0  0  0
+    3.0410    0.8232    0.1855 C   0  0  0  0  0  0  0  0  0  0  0  0
+    2.3687   -1.6155    0.0463 C   0  0  0  0  0  0  0  0  0  0  0  0
+    3.9935    1.1819    1.1545 C   0  0  0  0  0  0  0  0  0  0  0  0
+    3.1185    1.4155   -1.0867 C   0  0  0  0  0  0  0  0  0  0  0  0
+   -7.1061    0.8976   -1.0266 C   0  0  0  0  0  0  0  0  0  0  0  0
+    3.0746   -2.4482    0.9176 C   0  0  0  0  0  0  0  0  0  0  0  0
+    4.9873    2.1194    0.8610 C   0  0  0  0  0  0  0  0  0  0  0  0
+    4.1084    2.3564   -1.3784 C   0  0  0  0  0  0  0  0  0  0  0  0
+    3.4496   -3.7187    0.4871 C   0  0  0  0  0  0  0  0  0  0  0  0
+    5.0380    2.7045   -0.4026 C   0  0  0  0  0  0  0  0  0  0  0  0
+    2.4252   -3.2455   -1.6060 C   0  0  0  0  0  0  0  0  0  0  0  0
+    3.1214   -4.1271   -0.7990 C   0  0  0  0  0  0  0  0  0  0  0  0
+   -0.2263   -1.2199    1.0679 H   0  0  0  0  0  0  0  0  0  0  0  0
+   -1.6364    0.3807   -1.1209 H   0  0  0  0  0  0  0  0  0  0  0  0
+   -1.1831   -1.3082   -1.1808 H   0  0  0  0  0  0  0  0  0  0  0  0
+   -0.1894    0.8975    2.3595 H   0  0  0  0  0  0  0  0  0  0  0  0
+   -1.0042    1.7496    1.0680 H   0  0  0  0  0  0  0  0  0  0  0  0
+   -3.5642   -1.0250   -0.6514 H   0  0  0  0  0  0  0  0  0  0  0  0
+   -2.7343   -1.8665    0.6611 H   0  0  0  0  0  0  0  0  0  0  0  0
+   -2.1498   -0.5299    2.7684 H   0  0  0  0  0  0  0  0  0  0  0  0
+   -2.6054    1.1766    2.8103 H   0  0  0  0  0  0  0  0  0  0  0  0
+   -4.5185   -1.1314    2.2673 H   0  0  0  0  0  0  0  0  0  0  0  0
+   -4.8272    0.5917    2.4507 H   0  0  0  0  0  0  0  0  0  0  0  0
+   -5.6514   -1.1306    0.0739 H   0  0  0  0  0  0  0  0  0  0  0  0
+   -6.6737   -0.4399    1.3108 H   0  0  0  0  0  0  0  0  0  0  0  0
+    1.8204   -0.2159    1.5927 H   0  0  0  0  0  0  0  0  0  0  0  0
+   -6.0945    1.8686    0.5639 H   0  0  0  0  0  0  0  0  0  0  0  0
+   -5.0396    1.1941   -0.7083 H   0  0  0  0  0  0  0  0  0  0  0  0
+    3.9687    0.7355    2.1458 H   0  0  0  0  0  0  0  0  0  0  0  0
+    2.3964    1.1402   -1.8552 H   0  0  0  0  0  0  0  0  0  0  0  0
+    3.3355   -2.1177    1.9176 H   0  0  0  0  0  0  0  0  0  0  0  0
+    5.7167    2.3889    1.6199 H   0  0  0  0  0  0  0  0  0  0  0  0
+    4.1451    2.8085   -2.3655 H   0  0  0  0  0  0  0  0  0  0  0  0
+    3.9993   -4.3824    1.1485 H   0  0  0  0  0  0  0  0  0  0  0  0
+    2.1492   -3.5132   -2.6219 H   0  0  0  0  0  0  0  0  0  0  0  0
+    3.4047   -5.1069   -1.1664 H   0  0  0  0  0  0  0  0  0  0  0  0
+   -8.0410    1.8004   -2.3409 H   0  0  0  0  0  0  0  0  0  0  0  0
+  1 25  1  0
+  2  7  1  0
+  2 14  1  0
+  3 20  1  0
+  3 52  1  0
+  4 20  2  0
+  5 10  1  0
+  5 11  1  0
+  5 12  1  0
+  6 17  2  0
+  6 26  1  0
+  7  8  1  0
+  7  9  1  0
+  7 28  1  0
+  8 10  1  0
+  8 29  1  0
+  8 30  1  0
+  9 11  1  0
+  9 31  1  0
+  9 32  1  0
+ 10 33  1  0
+ 10 34  1  0
+ 11 35  1  0
+ 11 36  1  0
+ 12 13  1  0
+ 12 37  1  0
+ 12 38  1  0
+ 13 15  1  0
+ 13 39  1  0
+ 13 40  1  0
+ 14 16  1  0
+ 14 17  1  0
+ 14 41  1  1
+ 15 20  1  0
+ 15 42  1  0
+ 15 43  1  0
+ 16 18  2  0
+ 16 19  1  0
+ 17 21  1  0
+ 18 22  1  0
+ 18 44  1  0
+ 19 23  2  0
+ 19 45  1  0
+ 21 24  2  0
+ 21 46  1  0
+ 22 25  2  0
+ 22 47  1  0
+ 23 25  1  0
+ 23 48  1  0
+ 24 27  1  0
+ 24 49  1  0
+ 26 27  2  0
+ 26 50  1  0
+ 27 51  1  0
+M  END
+>  <compoud_name>  (1)
+H1_Bepotastine
+>  <SMILES>  (1)
+[H]OC(=O)C([H])([H])C([H])([H])C([H])([H])N1C([H])([H])C([H])([H])C([H])(OC([H])(c2nc([H])c([H])c([H])c2[H])c2c([H])c([H])c(Cl)c([H])c2[H])C([H])([H])C1([H])[H]
+>  <cid>  (1)
+2350
+>  <category>  (1)
+N
+>  <inchi>  (1)
+InChI=1S/C21H25ClN2O3/c22-17-8-6-16(7-9-17)21(19-4-1-2-12-23-19)27-18-10-14-24(15-11-18)13-3-5-20(25)26/h1-2,4,6-9,12,18,21H,3,5,10-11,13-15H2,(H,25,26)/t21-/m1/s1
+>  <Energy>  (1)
+49.1758
+$$$$
+H1_Quifenadine
+     RDKit          3D
+ 45 48  0  0  1  0  0  0  0  0999 V2000
+    0.1106    0.2102   -1.7897 O   0  0  0  0  0  0  0  0  0  0  0  0
+    3.4646    1.0770   -0.0854 N   0  0  0  0  0  0  0  0  0  0  0  0
+    2.0931   -1.1209    0.1252 C   0  0  0  0  0  0  0  0  0  0  0  0
+    1.1729    0.1166    0.3820 C   0  0  1  0  0  0  0  0  0  0  0  0
+    2.0299    1.3864    0.1159 C   0  0  0  0  0  0  0  0  0  0  0  0
+    2.7971   -1.0339   -1.2379 C   0  0  0  0  0  0  0  0  0  0  0  0
+    3.2148   -1.0584    1.1848 C   0  0  0  0  0  0  0  0  0  0  0  0
+    3.5902    0.2772   -1.3240 C   0  0  0  0  0  0  0  0  0  0  0  0
+    3.9592    0.2796    1.0561 C   0  0  0  0  0  0  0  0  0  0  0  0
+   -0.2029    0.1255   -0.3860 C   0  0  0  0  0  0  0  0  0  0  0  0
+   -1.1272    1.3230   -0.0602 C   0  0  0  0  0  0  0  0  0  0  0  0
+   -0.9736   -1.1857   -0.1269 C   0  0  0  0  0  0  0  0  0  0  0  0
+   -1.0387    2.0636    1.1310 C   0  0  0  0  0  0  0  0  0  0  0  0
+   -1.3454   -2.0428   -1.1782 C   0  0  0  0  0  0  0  0  0  0  0  0
+   -2.1533    1.6708   -0.9653 C   0  0  0  0  0  0  0  0  0  0  0  0
+   -1.3459   -1.5543    1.1811 C   0  0  0  0  0  0  0  0  0  0  0  0
+   -1.9065    3.1310    1.3840 C   0  0  0  0  0  0  0  0  0  0  0  0
+   -2.0526   -3.2227   -0.9327 C   0  0  0  0  0  0  0  0  0  0  0  0
+   -3.0179    2.7377   -0.7134 C   0  0  0  0  0  0  0  0  0  0  0  0
+   -2.0493   -2.7364    1.4259 C   0  0  0  0  0  0  0  0  0  0  0  0
+   -2.8897    3.4721    0.4604 C   0  0  0  0  0  0  0  0  0  0  0  0
+   -2.4022   -3.5700    0.3691 C   0  0  0  0  0  0  0  0  0  0  0  0
+    1.5541   -2.0675    0.2237 H   0  0  0  0  0  0  0  0  0  0  0  0
+    0.9532    0.0967    1.4588 H   0  0  0  0  0  0  0  0  0  0  0  0
+    1.6691    1.9630   -0.7430 H   0  0  0  0  0  0  0  0  0  0  0  0
+    1.9423    2.0685    0.9712 H   0  0  0  0  0  0  0  0  0  0  0  0
+    2.0851   -1.1104   -2.0638 H   0  0  0  0  0  0  0  0  0  0  0  0
+    3.4846   -1.8820   -1.3506 H   0  0  0  0  0  0  0  0  0  0  0  0
+    3.9137   -1.8918    1.0436 H   0  0  0  0  0  0  0  0  0  0  0  0
+    2.7942   -1.1596    2.1923 H   0  0  0  0  0  0  0  0  0  0  0  0
+    4.6485    0.0638   -1.5199 H   0  0  0  0  0  0  0  0  0  0  0  0
+    3.2467    0.8670   -2.1831 H   0  0  0  0  0  0  0  0  0  0  0  0
+    3.8541    0.8576    1.9828 H   0  0  0  0  0  0  0  0  0  0  0  0
+    5.0353    0.0986    0.9430 H   0  0  0  0  0  0  0  0  0  0  0  0
+    0.1304    1.1516   -2.0295 H   0  0  0  0  0  0  0  0  0  0  0  0
+   -0.3059    1.8245    1.8958 H   0  0  0  0  0  0  0  0  0  0  0  0
+   -1.0856   -1.7976   -2.2061 H   0  0  0  0  0  0  0  0  0  0  0  0
+   -2.2926    1.0941   -1.8795 H   0  0  0  0  0  0  0  0  0  0  0  0
+   -1.0974   -0.9178    2.0267 H   0  0  0  0  0  0  0  0  0  0  0  0
+   -1.8179    3.6927    2.3110 H   0  0  0  0  0  0  0  0  0  0  0  0
+   -2.3308   -3.8683   -1.7614 H   0  0  0  0  0  0  0  0  0  0  0  0
+   -3.7962    2.9864   -1.4300 H   0  0  0  0  0  0  0  0  0  0  0  0
+   -2.3260   -3.0022    2.4429 H   0  0  0  0  0  0  0  0  0  0  0  0
+   -3.5643    4.2999    0.6616 H   0  0  0  0  0  0  0  0  0  0  0  0
+   -2.9530   -4.4872    0.5586 H   0  0  0  0  0  0  0  0  0  0  0  0
+  1 10  1  0
+  1 35  1  0
+  2  5  1  0
+  2  8  1  0
+  2  9  1  0
+  3  4  1  0
+  3  6  1  0
+  3  7  1  0
+  3 23  1  0
+  4  5  1  0
+  4 10  1  0
+  4 24  1  1
+  5 25  1  0
+  5 26  1  0
+  6  8  1  0
+  6 27  1  0
+  6 28  1  0
+  7  9  1  0
+  7 29  1  0
+  7 30  1  0
+  8 31  1  0
+  8 32  1  0
+  9 33  1  0
+  9 34  1  0
+ 10 11  1  0
+ 10 12  1  0
+ 11 13  2  0
+ 11 15  1  0
+ 12 14  2  0
+ 12 16  1  0
+ 13 17  1  0
+ 13 36  1  0
+ 14 18  1  0
+ 14 37  1  0
+ 15 19  2  0
+ 15 38  1  0
+ 16 20  2  0
+ 16 39  1  0
+ 17 21  2  0
+ 17 40  1  0
+ 18 22  2  0
+ 18 41  1  0
+ 19 21  1  0
+ 19 42  1  0
+ 20 22  1  0
+ 20 43  1  0
+ 21 44  1  0
+ 22 45  1  0
+M  END
+>  <compoud_name>  (2)
+H1_Quifenadine
+>  <SMILES>  (2)
+[H]OC(c1c([H])c([H])c([H])c([H])c1[H])(c1c([H])c([H])c([H])c([H])c1[H])C1([H])C([H])([H])N2C([H])([H])C([H])([H])C1([H])C([H])([H])C2([H])[H]
+>  <cid>  (2)
+65600
+>  <category>  (2)
+N
+>  <inchi>  (2)
+InChI=1S/C20H23NO/c22-20(17-7-3-1-4-8-17,18-9-5-2-6-10-18)19-15-21-13-11-16(19)12-14-21/h1-10,16,19,22H,11-15H2/t19-/m1/s1
+>  <Energy>  (2)
+84.891
+$$$$
+H1_Rupatadine
+     RDKit          3D
+ 56 60  0  0  0  0  0  0  0  0999 V2000
+    6.5298    3.3080    0.0562 Cl  0  0  0  0  0  0  0  0  0  0  0  0
+   -2.1780    1.1440   -0.1081 N   0  0  0  0  0  0  0  0  0  0  0  0
+    1.8055   -2.5028    1.6263 N   0  0  0  0  0  0  0  0  0  0  0  0
+   -6.5347   -0.2932   -1.5666 N   0  0  0  0  0  0  0  0  0  0  0  0
+    0.4984    0.2017    0.7391 C   0  0  0  0  0  0  0  0  0  0  0  0
+   -0.7596   -0.6401    0.9176 C   0  0  0  0  0  0  0  0  0  0  0  0
+    0.1325    1.6779    0.6992 C   0  0  0  0  0  0  0  0  0  0  0  0
+   -1.8276   -0.2907   -0.1321 C   0  0  0  0  0  0  0  0  0  0  0  0
+   -0.9697    1.9571   -0.3378 C   0  0  0  0  0  0  0  0  0  0  0  0
+    1.7535   -0.3064    0.5966 C   0  0  0  0  0  0  0  0  0  0  0  0
+   -3.2065    1.4670   -1.1132 C   0  0  0  0  0  0  0  0  0  0  0  0
+    2.9347    0.5760    0.4016 C   0  0  0  0  0  0  0  0  0  0  0  0
+    1.9383   -1.7730    0.4937 C   0  0  0  0  0  0  0  0  0  0  0  0
+    3.7669    0.4917   -0.7359 C   0  0  0  0  0  0  0  0  0  0  0  0
+    3.6248   -0.5108   -1.8705 C   0  0  0  0  0  0  0  0  0  0  0  0
+    2.3939   -1.4219   -1.9523 C   0  0  0  0  0  0  0  0  0  0  0  0
+    2.2514   -2.3194   -0.7533 C   0  0  0  0  0  0  0  0  0  0  0  0
+   -4.5656    0.8945   -0.7963 C   0  0  0  0  0  0  0  0  0  0  0  0
+    3.2715    1.4705    1.4385 C   0  0  0  0  0  0  0  0  0  0  0  0
+    4.8769    1.3617   -0.8210 C   0  0  0  0  0  0  0  0  0  0  0  0
+    2.4290   -3.7014   -0.8308 C   0  0  0  0  0  0  0  0  0  0  0  0
+    4.3729    2.3200    1.3344 C   0  0  0  0  0  0  0  0  0  0  0  0
+    5.1670    2.2679    0.1982 C   0  0  0  0  0  0  0  0  0  0  0  0
+   -5.1566    1.0467    0.4633 C   0  0  0  0  0  0  0  0  0  0  0  0
+   -5.3042    0.2290   -1.7686 C   0  0  0  0  0  0  0  0  0  0  0  0
+    2.2947   -4.4730    0.3198 C   0  0  0  0  0  0  0  0  0  0  0  0
+    1.9875   -3.8347    1.5112 C   0  0  0  0  0  0  0  0  0  0  0  0
+   -6.4311    0.5316    0.7094 C   0  0  0  0  0  0  0  0  0  0  0  0
+   -7.0633   -0.1364   -0.3325 C   0  0  0  0  0  0  0  0  0  0  0  0
+   -7.0626    0.6338    2.0605 C   0  0  0  0  0  0  0  0  0  0  0  0
+   -0.5731   -1.7154    0.8560 H   0  0  0  0  0  0  0  0  0  0  0  0
+   -1.1596   -0.4557    1.9235 H   0  0  0  0  0  0  0  0  0  0  0  0
+   -0.2119    1.9818    1.6961 H   0  0  0  0  0  0  0  0  0  0  0  0
+    0.9793    2.3217    0.4489 H   0  0  0  0  0  0  0  0  0  0  0  0
+   -1.4699   -0.5848   -1.1284 H   0  0  0  0  0  0  0  0  0  0  0  0
+   -2.7127   -0.8992    0.0866 H   0  0  0  0  0  0  0  0  0  0  0  0
+   -1.2287    3.0211   -0.2712 H   0  0  0  0  0  0  0  0  0  0  0  0
+   -0.5727    1.7824   -1.3473 H   0  0  0  0  0  0  0  0  0  0  0  0
+   -2.8776    1.1445   -2.1102 H   0  0  0  0  0  0  0  0  0  0  0  0
+   -3.3405    2.5558   -1.1674 H   0  0  0  0  0  0  0  0  0  0  0  0
+    3.6660    0.0536   -2.8120 H   0  0  0  0  0  0  0  0  0  0  0  0
+    4.5182   -1.1506   -1.8447 H   0  0  0  0  0  0  0  0  0  0  0  0
+    2.4771   -2.0361   -2.8582 H   0  0  0  0  0  0  0  0  0  0  0  0
+    1.4795   -0.8292   -2.0837 H   0  0  0  0  0  0  0  0  0  0  0  0
+    2.6674    1.5029    2.3444 H   0  0  0  0  0  0  0  0  0  0  0  0
+    5.5326    1.3154   -1.6888 H   0  0  0  0  0  0  0  0  0  0  0  0
+    2.6741   -4.1805   -1.7747 H   0  0  0  0  0  0  0  0  0  0  0  0
+    4.6043    3.0064    2.1437 H   0  0  0  0  0  0  0  0  0  0  0  0
+   -4.6110    1.5606    1.2526 H   0  0  0  0  0  0  0  0  0  0  0  0
+   -4.9162    0.0859   -2.7735 H   0  0  0  0  0  0  0  0  0  0  0  0
+    2.4295   -5.5486    0.2902 H   0  0  0  0  0  0  0  0  0  0  0  0
+    1.8762   -4.3969    2.4339 H   0  0  0  0  0  0  0  0  0  0  0  0
+   -8.0471   -0.5796   -0.2022 H   0  0  0  0  0  0  0  0  0  0  0  0
+   -8.1536    0.6818    1.9793 H   0  0  0  0  0  0  0  0  0  0  0  0
+   -6.7913   -0.2348    2.6683 H   0  0  0  0  0  0  0  0  0  0  0  0
+   -6.7355    1.5422    2.5773 H   0  0  0  0  0  0  0  0  0  0  0  0
+  1 23  1  0
+  2  8  1  0
+  2  9  1  0
+  2 11  1  0
+  3 13  2  0
+  3 27  1  0
+  4 25  2  0
+  4 29  1  0
+  5  6  1  0
+  5  7  1  0
+  5 10  2  3
+  6  8  1  0
+  6 31  1  0
+  6 32  1  0
+  7  9  1  0
+  7 33  1  0
+  7 34  1  0
+  8 35  1  0
+  8 36  1  0
+  9 37  1  0
+  9 38  1  0
+ 10 12  1  0
+ 10 13  1  0
+ 11 18  1  0
+ 11 39  1  0
+ 11 40  1  0
+ 12 14  2  0
+ 12 19  1  0
+ 13 17  1  0
+ 14 15  1  0
+ 14 20  1  0
+ 15 16  1  0
+ 15 41  1  0
+ 15 42  1  0
+ 16 17  1  0
+ 16 43  1  0
+ 16 44  1  0
+ 17 21  2  0
+ 18 24  2  0
+ 18 25  1  0
+ 19 22  2  0
+ 19 45  1  0
+ 20 23  2  0
+ 20 46  1  0
+ 21 26  1  0
+ 21 47  1  0
+ 22 23  1  0
+ 22 48  1  0
+ 24 28  1  0
+ 24 49  1  0
+ 25 50  1  0
+ 26 27  2  0
+ 26 51  1  0
+ 27 52  1  0
+ 28 29  2  0
+ 28 30  1  0
+ 29 53  1  0
+ 30 54  1  0
+ 30 55  1  0
+ 30 56  1  0
+M  END
+>  <compoud_name>  (3)
+H1_Rupatadine
+>  <SMILES>  (3)
+[H]c1nc2c(c([H])c1[H])C([H])([H])C([H])([H])c1c([H])c(Cl)c([H])c([H])c1C2=C1C([H])([H])C([H])([H])N(C([H])([H])c2c([H])nc([H])c(C([H])([H])[H])c2[H])C([H])([H])C1([H])[H]
+>  <cid>  (3)
+133017
+>  <category>  (3)
+N
+>  <inchi>  (3)
+InChI=1S/C26H26ClN3/c1-18-13-19(16-28-15-18)17-30-11-8-20(9-12-30)25-24-7-6-23(27)14-22(24)5-4-21-3-2-10-29-26(21)25/h2-3,6-7,10,13-16H,4-5,8-9,11-12,17H2,1H3
+>  <Energy>  (3)
+119.976
+$$$$

sample_input_smiles.csv ADDED Viewed

	@@ -0,0 +1,6 @@

+OC(=O)CCCN1CCC(OC(c2ncccc2)c2ccc(Cl)cc2)CC1
+OC(c1ccccc1)(c1ccccc1)C1CN2CCC1CC2
+c1nc2c(cc1)CCc1cc(Cl)ccc1C2=C1CCN(Cc2cncc(C)c2)CC1
+C1=CC=C2C(=C1)C=CC3=CC=CC=C3N2C(=O)N
+CC(=O)Oc1ccccc1C(=O)O
+CC(=O)Oc1c(cc(cc1)Cl)C(=O)OC(=O)c1c(ccc(c1)Cl)OC(=O)C

utils.py ADDED Viewed

	@@ -0,0 +1,172 @@

+import itertools as it
+import os
+import joblib
+import numpy as np
+import pandas as pd
+import pkg_resources
+import streamlit as st
+from b3clf.descriptor_padel import compute_descriptors
+from b3clf.geometry_opt import geometry_optimize
+from b3clf.utils import get_descriptors, scale_descriptors, select_descriptors
+@st.cache_resource()
+def load_all_models():
+    """Get b3clf fitted classifier"""
+    clf_list = ["dtree", "knn", "logreg", "xgb"]
+    sampling_list = [
+        "borderline_SMOTE",
+        "classic_ADASYN",
+        "classic_RandUndersampling",
+        "classic_SMOTE",
+        "kmeans_SMOTE",
+        "common",
+    ]
+    model_dict = {}
+    package_name = "b3clf"
+    for clf_str, sampling_str in it.product(clf_list, sampling_list):
+        # joblib_fpath = os.path.join(
+        #     dirname, "pre_trained", "b3clf_{}_{}.joblib".format(clf_str, sampling_str))
+        # pred_model = joblib.load(joblib_fpath)
+        joblib_path_str = f"pre_trained/b3clf_{clf_str}_{sampling_str}.joblib"
+        with pkg_resources.resource_stream(package_name, joblib_path_str) as f:
+            pred_model = joblib.load(f)
+        model_dict[clf_str + "_" + sampling_str] = pred_model
+    return model_dict
+@st.cache_resource
+def predict_permeability(
+    clf_str, sampling_str, _models_dict, mol_features, info_df, threshold="none"
+):
+    """Compute permeability prediction for given feature data."""
+    # load the model
+    # pred_model = load_all_models()[clf_str + "_" + sampling_str]
+    pred_model = _models_dict[clf_str + "_" + sampling_str]
+    # load the threshold data
+    package_name = "b3clf"
+    with pkg_resources.resource_stream(package_name, "data/B3clf_thresholds.xlsx") as f:
+        df_thres = pd.read_excel(f, index_col=0, engine="openpyxl")
+    # default threshold is 0.5
+    label_pool = np.zeros(mol_features.shape[0], dtype=int)
+    if type(mol_features) == pd.DataFrame:
+        if mol_features.index.tolist() != info_df.index.tolist():
+            raise ValueError("Features_df and Info_df do not have the same index.")
+    # get predicted probabilities
+    info_df.loc[:, "B3clf_predicted_probability"] = pred_model.predict_proba(
+        mol_features
+    )[:, 1]
+    # get predicted label from probability using the threshold
+    mask = np.greater_equal(
+        info_df["B3clf_predicted_probability"].to_numpy(),
+        # df_thres.loc[clf_str + "-" + sampling_str, threshold])
+        df_thres.loc["xgb-classic_ADASYN", threshold],
+    )
+    label_pool[mask] = 1
+    # save the predicted labels
+    info_df["B3clf_predicted_label"] = label_pool
+    info_df.reset_index(inplace=True)
+    return info_df
+@st.cache_resource
+def generate_predictions(
+    input_fname: str = None,
+    sep: str = "\s+|\t+",
+    clf: str = "xgb",
+    _models_dict: dict = None,
+    keep_sdf: str = "no",
+    sampling: str = "classic_ADASYN",
+    time_per_mol: int = 120,
+    mol_features: pd.DataFrame = None,
+    info_df: pd.DataFrame = None,
+):
+    """
+    Generate predictions for a given input file.
+    """
+    try:
+        if mol_features is None and info_df is None:
+            if input_fname is None:
+                raise ValueError("Either input_fname or mol_features/info_df must be provided")
+            mol_tag = os.path.basename(input_fname).split(".")[0]
+            file_ext = os.path.splitext(input_fname)[1].lower()
+            internal_sdf = f"{mol_tag}_optimized_3d.sdf"
+            try:
+                # Handle different file types
+                if file_ext == '.csv':
+                    sep = ','
+                elif file_ext == '.txt' or file_ext == '.smi':
+                    sep = '\s+|\t+'
+                elif file_ext != '.sdf':
+                    raise ValueError(f"Unsupported file type: {file_ext}")
+                # Geometry optimization
+                geometry_optimize(input_fname=input_fname, output_sdf=internal_sdf, sep=sep)
+                # Compute descriptors with timeout handling
+                df_features = compute_descriptors(
+                    sdf_file=internal_sdf,
+                    excel_out=None,
+                    output_csv=None,
+                    timeout=time_per_mol * 2,  # Double the per-molecule time for total timeout
+                    time_per_molecule=time_per_mol,
+                )
+                # Get computed descriptors
+                mol_features, info_df = get_descriptors(df=df_features)
+                # Select descriptors
+                mol_features = select_descriptors(df=mol_features)
+                # Scale descriptors
+                mol_features.iloc[:, :] = scale_descriptors(df=mol_features)
+            finally:
+                # Clean up temporary files
+                if os.path.exists(internal_sdf) and keep_sdf == "no":
+                    try:
+                        os.remove(internal_sdf)
+                    except:
+                        pass
+        # Get predictions
+        result_df = predict_permeability(
+            clf_str=clf,
+            sampling_str=sampling,
+            _models_dict=_models_dict,
+            mol_features=mol_features,
+            info_df=info_df,
+            threshold="none",
+        )
+        # Select display columns
+        display_cols = [
+            "ID",
+            "SMILES",
+            "B3clf_predicted_probability",
+            "B3clf_predicted_label",
+        ]
+        result_df = result_df[
+            [col for col in result_df.columns.to_list() if col in display_cols]
+        ]
+        return mol_features, info_df, result_df
+    except Exception as e:
+        import traceback
+        st.error(f"Error in generate_predictions: {str(e)}\n{traceback.format_exc()}")
+        raise