legend1234 commited on
Commit
ec17199
·
1 Parent(s): 77abf89

Add working webserver

Browse files
Files changed (7) hide show
  1. README.md +2 -2
  2. app.py +404 -0
  3. packages.txt +1 -0
  4. requirements.txt +17 -0
  5. sample_input.sdf +387 -0
  6. sample_input_smiles.csv +6 -0
  7. utils.py +172 -0
README.md CHANGED
@@ -1,8 +1,8 @@
1
  ---
2
  title: B3lcf
3
  emoji: 🏆
4
- colorFrom: yellow
5
- colorTo: green
6
  sdk: streamlit
7
  sdk_version: 1.41.1
8
  app_file: app.py
 
1
  ---
2
  title: B3lcf
3
  emoji: 🏆
4
+ colorFrom: blue
5
+ colorTo: pink
6
  sdk: streamlit
7
  sdk_version: 1.41.1
8
  app_file: app.py
app.py ADDED
@@ -0,0 +1,404 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import itertools as it
2
+ import os
3
+ import tempfile
4
+ from io import StringIO
5
+
6
+ import joblib
7
+ import numpy as np
8
+ import pandas as pd
9
+ import pkg_resources
10
+ # page set up
11
+ import streamlit as st
12
+ from b3clf.descriptor_padel import compute_descriptors
13
+ from b3clf.geometry_opt import geometry_optimize
14
+ from b3clf.utils import get_descriptors, scale_descriptors, select_descriptors
15
+ # from PIL import Image
16
+ from streamlit_extras.let_it_rain import rain
17
+ from streamlit_ketcher import st_ketcher
18
+
19
+ from utils import generate_predictions, load_all_models
20
+
21
+ st.cache_data.clear()
22
+
23
+ st.set_page_config(
24
+ page_title="BBB Permeability Prediction with Imbalanced Learning",
25
+ # page_icon="🧊",
26
+ layout="wide",
27
+ # initial_sidebar_state="expanded",
28
+ # menu_items={
29
+ # "Get Help": "https://www.extremelycoolapp.com/help",
30
+ # "Report a bug": "https://www.extremelycoolapp.com/bug",
31
+ # "About": "# This is a header. This is an *extremely* cool app!"
32
+ # }
33
+ )
34
+
35
+
36
+ keep_features = "no"
37
+ keep_sdf = "no"
38
+ classifiers_dict = {
39
+ "decision tree": "dtree",
40
+ "kNN": "knn",
41
+ "logistic regression": "logreg",
42
+ "XGBoost": "xgb",
43
+ }
44
+ resample_methods_dict = {
45
+ "random undersampling": "classic_RandUndersampling",
46
+ "SMOTE": "classic_SMOTE",
47
+ "Borderline SMOTE": "borderline_SMOTE",
48
+ "k-means SMOTE": "kmeans_SMOTE",
49
+ "ADASYN": "classic_ADASYN",
50
+ "no resampling": "common",
51
+ }
52
+
53
+ pandas_display_options = {
54
+ "line_limit": 50,
55
+ }
56
+ mol_features = None
57
+ info_df = None
58
+ results = None
59
+ temp_file_path = None
60
+ all_models = load_all_models()
61
+
62
+ # Initialize global variables and cleanup function
63
+ if 'temp_dir' not in st.session_state:
64
+ st.session_state.temp_dir = None
65
+ if 'processing' not in st.session_state:
66
+ st.session_state.processing = False
67
+
68
def cleanup_temp_files():
    """Delete the session's temporary directory, if one is recorded and exists.

    Reads the path from ``st.session_state.temp_dir``. When that value is
    empty or the path is no longer on disk, nothing happens. After a
    successful removal the session entry is reset to ``None``; if removal
    fails, the error is reported through ``st.error`` and the session entry
    is left as it was.
    """
    tmp_path = st.session_state.temp_dir
    # Guard clause: nothing recorded, or the directory is already gone.
    if not tmp_path or not os.path.exists(tmp_path):
        return
    try:
        # Imported locally so this function does not rely on a module-level
        # ``shutil`` import.
        import shutil
        shutil.rmtree(tmp_path)
        st.session_state.temp_dir = None
    except Exception as exc:
        st.error(f"Error cleaning up temporary files: {exc}")
77
+
78
def clear_cache():
    """Reset Streamlit caches, per-session result data and temporary files.

    Clears both ``st.cache_data`` and ``st.cache_resource``, sets the
    ``mol_features`` and ``info_df`` session entries to ``None`` when they
    are present, and finally removes the temporary directory through
    ``cleanup_temp_files()``.
    """
    st.cache_data.clear()
    # NOTE(review): this clears every st.cache_resource entry, which
    # presumably includes cached models -- confirm that is intended.
    st.cache_resource.clear()
    for state_key in ("mol_features", "info_df"):
        # Only touch keys that already exist; missing keys are not created.
        if state_key in st.session_state:
            st.session_state[state_key] = None
    cleanup_temp_files()
87
+
88
+ # Create the Streamlit app
89
+ st.title(":blue[BBB Permeability Prediction with Imbalanced Learning]")
90
+ info_column, upload_column = st.columns(2)
91
+
92
+ # initialize the molecule features and info dataframe session state
93
+ if "mol_features" not in st.session_state:
94
+ st.session_state.mol_features = None
95
+ if "info_df" not in st.session_state:
96
+ st.session_state.info_df = None
97
+ if "classifier" not in st.session_state:
98
+ st.session_state.classifier = "XGBoost"
99
+ if "resampler" not in st.session_state:
100
+ st.session_state.resampler = "ADASYN"
101
+ if "historical_data" not in st.session_state:
102
+ st.session_state.historical_data = []
103
+
104
+ # download sample files
105
+ with info_column:
106
+ st.subheader("About `B3clf`")
107
+ # fmt: off
108
+ st.markdown(
109
+ """
110
+ `B3clf` is a Python package for predicting the blood-brain barrier (BBB) permeability of small molecules using imbalanced learning. It supports decision tree, XGBoost, kNN, logistical regression and 5 resampling strategies (SMOTE, Borderline SMOTE, k-means SMOTE and ADASYN). The workflow of `B3clf` is summarized as below. The Source code and more details are available at https://github.com/theochem/B3clf. This project is supported by Digital Research Alliance of Canada (originally known as Compute Canada) and NSERC. This project is maintained by QC-Dev comminity. For further information and inquiries please contact us at qcdevs@gmail.com."""
111
+ )
112
+ st.text(" \n")
113
+ # text_body = """
114
+ # `B3clf` is a Python package for predicting the blood-brain barrier (BBB) permeability of small molecules using imbalanced learning. It supports decision tree, XGBoost, kNN, logistical regression and 5 resampling strategies (SMOTE, Borderline SMOTE, k-means SMOTE and ADASYN). The workflow of `B3clf` is summarized as below. The Source code and more details are available at https://github.com/theochem/B3clf.
115
+ # """
116
+ # st.markdown(f"<p align="justify">{text_body}</p>",
117
+ # unsafe_allow_html=True)
118
+
119
+ # image = Image.open("images/b3clf_workflow.png")
120
+ # st.image(image=image, use_column_width=True)
121
+
122
+ # image_path = "images/b3clf_workflow.png"
123
+ # image_width_percent = 80
124
+ # info_column.markdown(
125
+ # f"<img src="{image_path}" style="max-width: {image_width_percent}%; height: auto;">",
126
+ # unsafe_allow_html=True
127
+ # )
128
+
129
+ # fmt: on
130
+ sdf_col, smi_col = st.columns(2)
131
+ with sdf_col:
132
+ # uneven columns
133
+ # st.columns((2, 1, 1, 1))
134
+ # two subcolumns for sample input files
135
+ # download sample sdf
136
+ # st.markdown(" \n \n")
137
+ with open("sample_input.sdf", "r") as file_sdf:
138
+ btn = st.download_button(
139
+ label="Download SDF sample file",
140
+ data=file_sdf,
141
+ file_name="sample_input.sdf",
142
+ )
143
+ with smi_col:
144
+ with open("sample_input_smiles.csv", "r") as file_smi:
145
+ btn = st.download_button(
146
+ label="Download SMILES sample file",
147
+ data=file_smi,
148
+ file_name="sample_input_smiles.csv",
149
+ )
150
+
151
+ # Create a file uploader
152
+ with upload_column:
153
+ st.subheader("Model Selection")
154
+ with st.container():
155
+ algorithm_col, resampler_col = st.columns(2)
156
+ # algorithm and resampling method selection column
157
+ with algorithm_col:
158
+ classifier = st.selectbox(
159
+ label="Classification Algorithm:",
160
+ options=("XGBoost", "kNN", "decision tree", "logistic regression"),
161
+ key="classifier",
162
+ help="Select the classification algorithm to use"
163
+ )
164
+ with resampler_col:
165
+ resampler = st.selectbox(
166
+ label="Resampling Method:",
167
+ options=(
168
+ "ADASYN",
169
+ "random undersampling",
170
+ "Borderline SMOTE",
171
+ "k-means SMOTE",
172
+ "SMOTE",
173
+ "no resampling",
174
+ ),
175
+ key="resampler",
176
+ help="Select the resampling method to handle imbalanced data"
177
+ )
178
+
179
+ # Update session state based on selections
180
+ if "classifier" not in st.session_state:
181
+ st.session_state.classifier = classifier
182
+ if "resampler" not in st.session_state:
183
+ st.session_state.resampler = resampler
184
+
185
+ # horizontal line
186
+ st.divider()
187
+ # upload_col, submit_job_col = st.columns((2, 1))
188
+ upload_col, _, submit_job_col, _ = st.columns((4, 0.05, 1, 0.05))
189
+ # upload file column
190
+ with upload_col:
191
+ # session state tracking of the file uploader
192
+ if "uploaded_file" not in st.session_state:
193
+ st.session_state.uploaded_file = None
194
+ if "uploaded_file_changed" not in st.session_state:
195
+ st.session_state.uploaded_file_changed = False
196
+
197
+ # def update_uploader_session_info():
198
+ # """Update the session state of the file uploader."""
199
+ # st.session_state.uploaded_file = uploaded_file
200
+
201
+ uploaded_file = st.file_uploader(
202
+ label="Upload a CSV, SDF, TXT or SMI file",
203
+ type=["csv", "sdf", "txt", "smi"],
204
+ help="Input molecule file only supports *.csv, *.sdf, *.txt and *.smi.",
205
+ accept_multiple_files=False,
206
+ # key="uploaded_file",
207
+ # on_change=update_uploader_session_info,
208
+ )
209
+
210
+ if uploaded_file:
211
+ # st.write(f"the uploaded file: {uploaded_file}")
212
+ # when the newly uploaded file differs from the previous one
213
+ if st.session_state.uploaded_file != uploaded_file:
214
+ st.session_state.uploaded_file_changed = True
215
+ else:
216
+ st.session_state.uploaded_file_changed = False
217
+ st.session_state.uploaded_file = uploaded_file
218
+ # when new file is the same as the previous one
219
+ # else:
220
+ # st.session_state.uploaded_file_changed = False
221
+ # st.session_state.uploaded_file = uploaded_file
222
+
223
+ # set session state for the file uploader
224
+ # st.write(f"the state of uploaded file: {st.session_state.uploaded_file}")
225
+ # st.write(f"the state of uploaded file changed: {st.session_state.uploaded_file_changed}")
226
+
227
+ # submit job column
228
+ with submit_job_col:
229
+ st.text(" \n")
230
+ st.text(" \n")
231
+ st.markdown(
232
+ "<div style='display: flex; justify-content: center;'>",
233
+ unsafe_allow_html=True,
234
+ )
235
+ submit_job_button = st.button(
236
+ label="Submit Job",
237
+ type="secondary",
238
+ key="job_button",
239
+ help="Click to start calculations with current configuration"
240
+ )
241
+
242
+ if not submit_job_button:
243
+ if "results" in locals():
244
+ del results
245
+ if "mol_features" in locals():
246
+ del mol_features
247
+ if "info_df" in locals():
248
+ del info_df
249
+
250
+ # Display sections
251
+ feature_column, prediction_column = st.columns(2)
252
+ with feature_column:
253
+ st.subheader("Molecular Features")
254
+ placeholder_features = st.empty()
255
+
256
+ with prediction_column:
257
+ st.subheader("Predictions")
258
+
259
+ # Only process when Submit Job is clicked
260
if submit_job_button:
    # Snapshot the previous results BEFORE anything is cleared: clear_cache()
    # below resets st.session_state.mol_features / info_df to None, and both
    # the history entry and the "recalculate" path need the old values.
    previous_features = st.session_state.mol_features
    previous_info = st.session_state.info_df

    # Compare against None explicitly. mol_features is handled as a pandas
    # object (``.copy()`` below); truth-testing a DataFrame raises ValueError.
    if not uploaded_file and previous_features is None:
        st.warning("Please upload a file first or select data from history to process.")
    elif st.session_state.processing:
        st.warning("A job is already running. Please wait for it to complete.")
    else:
        try:
            st.session_state.processing = True
            with st.spinner('Processing... Please wait.'):
                # clear_cache() removes the previous temporary directory as
                # well, so a separate cleanup_temp_files() call is not needed.
                clear_cache()

                # Case 1: New file uploaded
                if uploaded_file:
                    # Create new temporary directory
                    st.session_state.temp_dir = tempfile.mkdtemp()
                    # basename() keeps a client-supplied name from escaping
                    # the temporary directory.
                    temp_file_path = os.path.join(
                        st.session_state.temp_dir,
                        os.path.basename(uploaded_file.name),
                    )

                    with open(temp_file_path, "wb") as temp_file:
                        # getvalue() returns the whole buffer regardless of
                        # the current read position of the upload stream.
                        temp_file.write(uploaded_file.getvalue())

                    # Store the previous data in history before processing new data
                    if previous_features is not None and previous_info is not None:
                        st.session_state.historical_data.append({
                            'mol_features': previous_features.copy(),
                            'info_df': previous_info.copy()
                        })

                    # Clear current data so a failed run does not leave stale results
                    st.session_state.mol_features = None
                    st.session_state.info_df = None

                    try:
                        mol_features, info_df, results = generate_predictions(
                            input_fname=temp_file_path,
                            sep=r"\s+|\t+",
                            clf=classifiers_dict[st.session_state.classifier],
                            _models_dict=all_models,
                            sampling=resample_methods_dict[st.session_state.resampler],
                            time_per_mol=120,
                            mol_features=None,
                            info_df=None,
                        )
                    finally:
                        # Clean up temporary files after processing
                        cleanup_temp_files()

                # Case 2: Recalculate with existing data
                else:
                    mol_features, info_df, results = generate_predictions(
                        input_fname=None,
                        sep=r"\s+|\t+",
                        clf=classifiers_dict[st.session_state.classifier],
                        _models_dict=all_models,
                        sampling=resample_methods_dict[st.session_state.resampler],
                        time_per_mol=120,
                        mol_features=previous_features,
                        info_df=previous_info,
                    )

                # Update session state with new results
                if mol_features is not None and info_df is not None:
                    st.session_state.mol_features = mol_features
                    st.session_state.info_df = info_df

        except Exception as e:
            # Top-level UI boundary: report the failure instead of crashing the page.
            st.error(f"Error during processing: {str(e)}")
        finally:
            # Always release the "job running" flag, even after a failure.
            st.session_state.processing = False
331
+
332
+ # Display results
333
+ # feature table
334
+ with feature_column:
335
+ if st.session_state.mol_features is not None:
336
+ selected_feature_rows = np.min(
337
+ [st.session_state.mol_features.shape[0], pandas_display_options["line_limit"]]
338
+ )
339
+ st.dataframe(st.session_state.mol_features.iloc[:selected_feature_rows, :], hide_index=False)
340
+ # placeholder_features.dataframe(mol_features, hide_index=False)
341
+ feature_file_name = uploaded_file.name.split(".")[0] + "_b3clf_features.csv"
342
+ features_csv = st.session_state.mol_features.to_csv(index=True)
343
+ st.download_button(
344
+ "Download features as CSV",
345
+ data=features_csv,
346
+ file_name=feature_file_name,
347
+ )
348
+ # prediction table
349
+ with prediction_column:
350
+ # st.subheader("Predictions")
351
+ if results is not None:
352
+ # Display the predictions in a table
353
+ selected_result_rows = np.min(
354
+ [results.shape[0], pandas_display_options["line_limit"]]
355
+ )
356
+ results_df_display = results.iloc[:selected_result_rows, :].style.format(
357
+ {"B3clf_predicted_probability": "{:.6f}".format}
358
+ )
359
+ st.dataframe(results_df_display, hide_index=True)
360
+ # Add a button to download the predictions as a CSV file
361
+ predictions_csv = results.to_csv(index=True)
362
+ results_file_name = (
363
+ uploaded_file.name.split(".")[0] + "_b3clf_predictions.csv"
364
+ )
365
+ st.download_button(
366
+ "Download predictions as CSV",
367
+ data=predictions_csv,
368
+ file_name=results_file_name,
369
+ )
370
+ # indicate the success of the job
371
+ # rain(
372
+ # emoji="🎈",
373
+ # font_size=54,
374
+ # falling_speed=5,
375
+ # animation_length=10,
376
+ # )
377
+ st.balloons()
378
+
379
+
380
+ # hide footer
381
+ # https://github.com/streamlit/streamlit/issues/892
382
+ hide_streamlit_style = """
383
+ <style>
384
+ #MainMenu {visibility: hidden;}
385
+ footer {visibility: hidden;}
386
+ </style>
387
+ """
388
+ st.markdown(hide_streamlit_style, unsafe_allow_html=True)
389
+
390
+ # add google analytics
391
+ st.markdown(
392
+ """
393
+ <!-- Google tag (gtag.js) -->
394
+ <script async src="https://www.googletagmanager.com/gtag/js?id=G-WG8QYRELP9"></script>
395
+ <script>
396
+ window.dataLayer = window.dataLayer || [];
397
+ function gtag(){dataLayer.push(arguments);}
398
+ gtag("js", new Date());
399
+
400
+ gtag("config", "G-WG8QYRELP9");
401
+ </script>
402
+ """,
403
+ unsafe_allow_html=True,
404
+ )
packages.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ default-jre
requirements.txt ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ numpy==1.24.4
2
+ scipy==1.10.1
3
+ scikit-learn==0.24.2
4
+ joblib==1.3.2
5
+ pandas==2.0.3
6
+ openpyxl==3.1.2
7
+ xgboost==1.4.2
8
+ padelpy>=0.1.11
9
+ rdkit==2023.03.3
10
+ # streamlit-extra==0.3.4
11
+ git+https://github.com/arnaudmiribel/streamlit-extras@v0.3.4
12
+ # for visualization
13
+ streamlit-ketcher
14
+ # for single molecule
15
+ # py3Dmol==2.0.0.post2
16
+ # stmol==0.0.9
17
+ git+https://github.com/theochem/B3clf.git
sample_input.sdf ADDED
@@ -0,0 +1,387 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ H1_Bepotastine
2
+ RDKit 3D
3
+
4
+ 52 54 0 0 1 0 0 0 0 0999 V2000
5
+ 6.2601 3.8627 -0.7580 Cl 0 0 0 0 0 0 0 0 0 0 0 0
6
+ 0.7350 0.2169 -0.1032 O 0 0 0 0 0 0 0 0 0 0 0 0
7
+ -7.2627 2.0029 -1.7812 O 0 0 0 0 0 0 0 0 0 0 0 0
8
+ -7.8739 -0.0429 -1.1421 O 0 0 0 0 0 0 0 0 0 0 0 0
9
+ -3.2826 0.1387 1.0997 N 0 0 0 0 0 0 0 0 0 0 0 0
10
+ 2.0420 -2.0119 -1.2138 N 0 0 0 0 0 0 0 0 0 0 0 0
11
+ -0.4341 -0.2713 0.5552 C 0 0 0 0 0 0 0 0 0 0 0 0
12
+ -1.5088 -0.5144 -0.4974 C 0 0 0 0 0 0 0 0 0 0 0 0
13
+ -0.9255 0.7694 1.5572 C 0 0 0 0 0 0 0 0 0 0 0 0
14
+ -2.8345 -0.8975 0.1550 C 0 0 0 0 0 0 0 0 0 0 0 0
15
+ -2.2740 0.3674 2.1479 C 0 0 0 0 0 0 0 0 0 0 0 0
16
+ -4.5811 -0.1850 1.7144 C 0 0 0 0 0 0 0 0 0 0 0 0
17
+ -5.7574 -0.2607 0.7330 C 0 0 0 0 0 0 0 0 0 0 0 0
18
+ 1.9672 -0.2099 0.5040 C 0 0 2 0 0 0 0 0 0 0 0 0
19
+ -5.9298 1.0111 -0.0974 C 0 0 0 0 0 0 0 0 0 0 0 0
20
+ 3.0410 0.8232 0.1855 C 0 0 0 0 0 0 0 0 0 0 0 0
21
+ 2.3687 -1.6155 0.0463 C 0 0 0 0 0 0 0 0 0 0 0 0
22
+ 3.9935 1.1819 1.1545 C 0 0 0 0 0 0 0 0 0 0 0 0
23
+ 3.1185 1.4155 -1.0867 C 0 0 0 0 0 0 0 0 0 0 0 0
24
+ -7.1061 0.8976 -1.0266 C 0 0 0 0 0 0 0 0 0 0 0 0
25
+ 3.0746 -2.4482 0.9176 C 0 0 0 0 0 0 0 0 0 0 0 0
26
+ 4.9873 2.1194 0.8610 C 0 0 0 0 0 0 0 0 0 0 0 0
27
+ 4.1084 2.3564 -1.3784 C 0 0 0 0 0 0 0 0 0 0 0 0
28
+ 3.4496 -3.7187 0.4871 C 0 0 0 0 0 0 0 0 0 0 0 0
29
+ 5.0380 2.7045 -0.4026 C 0 0 0 0 0 0 0 0 0 0 0 0
30
+ 2.4252 -3.2455 -1.6060 C 0 0 0 0 0 0 0 0 0 0 0 0
31
+ 3.1214 -4.1271 -0.7990 C 0 0 0 0 0 0 0 0 0 0 0 0
32
+ -0.2263 -1.2199 1.0679 H 0 0 0 0 0 0 0 0 0 0 0 0
33
+ -1.6364 0.3807 -1.1209 H 0 0 0 0 0 0 0 0 0 0 0 0
34
+ -1.1831 -1.3082 -1.1808 H 0 0 0 0 0 0 0 0 0 0 0 0
35
+ -0.1894 0.8975 2.3595 H 0 0 0 0 0 0 0 0 0 0 0 0
36
+ -1.0042 1.7496 1.0680 H 0 0 0 0 0 0 0 0 0 0 0 0
37
+ -3.5642 -1.0250 -0.6514 H 0 0 0 0 0 0 0 0 0 0 0 0
38
+ -2.7343 -1.8665 0.6611 H 0 0 0 0 0 0 0 0 0 0 0 0
39
+ -2.1498 -0.5299 2.7684 H 0 0 0 0 0 0 0 0 0 0 0 0
40
+ -2.6054 1.1766 2.8103 H 0 0 0 0 0 0 0 0 0 0 0 0
41
+ -4.5185 -1.1314 2.2673 H 0 0 0 0 0 0 0 0 0 0 0 0
42
+ -4.8272 0.5917 2.4507 H 0 0 0 0 0 0 0 0 0 0 0 0
43
+ -5.6514 -1.1306 0.0739 H 0 0 0 0 0 0 0 0 0 0 0 0
44
+ -6.6737 -0.4399 1.3108 H 0 0 0 0 0 0 0 0 0 0 0 0
45
+ 1.8204 -0.2159 1.5927 H 0 0 0 0 0 0 0 0 0 0 0 0
46
+ -6.0945 1.8686 0.5639 H 0 0 0 0 0 0 0 0 0 0 0 0
47
+ -5.0396 1.1941 -0.7083 H 0 0 0 0 0 0 0 0 0 0 0 0
48
+ 3.9687 0.7355 2.1458 H 0 0 0 0 0 0 0 0 0 0 0 0
49
+ 2.3964 1.1402 -1.8552 H 0 0 0 0 0 0 0 0 0 0 0 0
50
+ 3.3355 -2.1177 1.9176 H 0 0 0 0 0 0 0 0 0 0 0 0
51
+ 5.7167 2.3889 1.6199 H 0 0 0 0 0 0 0 0 0 0 0 0
52
+ 4.1451 2.8085 -2.3655 H 0 0 0 0 0 0 0 0 0 0 0 0
53
+ 3.9993 -4.3824 1.1485 H 0 0 0 0 0 0 0 0 0 0 0 0
54
+ 2.1492 -3.5132 -2.6219 H 0 0 0 0 0 0 0 0 0 0 0 0
55
+ 3.4047 -5.1069 -1.1664 H 0 0 0 0 0 0 0 0 0 0 0 0
56
+ -8.0410 1.8004 -2.3409 H 0 0 0 0 0 0 0 0 0 0 0 0
57
+ 1 25 1 0
58
+ 2 7 1 0
59
+ 2 14 1 0
60
+ 3 20 1 0
61
+ 3 52 1 0
62
+ 4 20 2 0
63
+ 5 10 1 0
64
+ 5 11 1 0
65
+ 5 12 1 0
66
+ 6 17 2 0
67
+ 6 26 1 0
68
+ 7 8 1 0
69
+ 7 9 1 0
70
+ 7 28 1 0
71
+ 8 10 1 0
72
+ 8 29 1 0
73
+ 8 30 1 0
74
+ 9 11 1 0
75
+ 9 31 1 0
76
+ 9 32 1 0
77
+ 10 33 1 0
78
+ 10 34 1 0
79
+ 11 35 1 0
80
+ 11 36 1 0
81
+ 12 13 1 0
82
+ 12 37 1 0
83
+ 12 38 1 0
84
+ 13 15 1 0
85
+ 13 39 1 0
86
+ 13 40 1 0
87
+ 14 16 1 0
88
+ 14 17 1 0
89
+ 14 41 1 1
90
+ 15 20 1 0
91
+ 15 42 1 0
92
+ 15 43 1 0
93
+ 16 18 2 0
94
+ 16 19 1 0
95
+ 17 21 1 0
96
+ 18 22 1 0
97
+ 18 44 1 0
98
+ 19 23 2 0
99
+ 19 45 1 0
100
+ 21 24 2 0
101
+ 21 46 1 0
102
+ 22 25 2 0
103
+ 22 47 1 0
104
+ 23 25 1 0
105
+ 23 48 1 0
106
+ 24 27 1 0
107
+ 24 49 1 0
108
+ 26 27 2 0
109
+ 26 50 1 0
110
+ 27 51 1 0
111
+ M END
112
+ > <compoud_name> (1)
113
+ H1_Bepotastine
114
+
115
+ > <SMILES> (1)
116
+ [H]OC(=O)C([H])([H])C([H])([H])C([H])([H])N1C([H])([H])C([H])([H])C([H])(OC([H])(c2nc([H])c([H])c([H])c2[H])c2c([H])c([H])c(Cl)c([H])c2[H])C([H])([H])C1([H])[H]
117
+
118
+ > <cid> (1)
119
+ 2350
120
+
121
+ > <category> (1)
122
+ N
123
+
124
+ > <inchi> (1)
125
+ InChI=1S/C21H25ClN2O3/c22-17-8-6-16(7-9-17)21(19-4-1-2-12-23-19)27-18-10-14-24(15-11-18)13-3-5-20(25)26/h1-2,4,6-9,12,18,21H,3,5,10-11,13-15H2,(H,25,26)/t21-/m1/s1
126
+
127
+ > <Energy> (1)
128
+ 49.1758
129
+
130
+ $$$$
131
+ H1_Quifenadine
132
+ RDKit 3D
133
+
134
+ 45 48 0 0 1 0 0 0 0 0999 V2000
135
+ 0.1106 0.2102 -1.7897 O 0 0 0 0 0 0 0 0 0 0 0 0
136
+ 3.4646 1.0770 -0.0854 N 0 0 0 0 0 0 0 0 0 0 0 0
137
+ 2.0931 -1.1209 0.1252 C 0 0 0 0 0 0 0 0 0 0 0 0
138
+ 1.1729 0.1166 0.3820 C 0 0 1 0 0 0 0 0 0 0 0 0
139
+ 2.0299 1.3864 0.1159 C 0 0 0 0 0 0 0 0 0 0 0 0
140
+ 2.7971 -1.0339 -1.2379 C 0 0 0 0 0 0 0 0 0 0 0 0
141
+ 3.2148 -1.0584 1.1848 C 0 0 0 0 0 0 0 0 0 0 0 0
142
+ 3.5902 0.2772 -1.3240 C 0 0 0 0 0 0 0 0 0 0 0 0
143
+ 3.9592 0.2796 1.0561 C 0 0 0 0 0 0 0 0 0 0 0 0
144
+ -0.2029 0.1255 -0.3860 C 0 0 0 0 0 0 0 0 0 0 0 0
145
+ -1.1272 1.3230 -0.0602 C 0 0 0 0 0 0 0 0 0 0 0 0
146
+ -0.9736 -1.1857 -0.1269 C 0 0 0 0 0 0 0 0 0 0 0 0
147
+ -1.0387 2.0636 1.1310 C 0 0 0 0 0 0 0 0 0 0 0 0
148
+ -1.3454 -2.0428 -1.1782 C 0 0 0 0 0 0 0 0 0 0 0 0
149
+ -2.1533 1.6708 -0.9653 C 0 0 0 0 0 0 0 0 0 0 0 0
150
+ -1.3459 -1.5543 1.1811 C 0 0 0 0 0 0 0 0 0 0 0 0
151
+ -1.9065 3.1310 1.3840 C 0 0 0 0 0 0 0 0 0 0 0 0
152
+ -2.0526 -3.2227 -0.9327 C 0 0 0 0 0 0 0 0 0 0 0 0
153
+ -3.0179 2.7377 -0.7134 C 0 0 0 0 0 0 0 0 0 0 0 0
154
+ -2.0493 -2.7364 1.4259 C 0 0 0 0 0 0 0 0 0 0 0 0
155
+ -2.8897 3.4721 0.4604 C 0 0 0 0 0 0 0 0 0 0 0 0
156
+ -2.4022 -3.5700 0.3691 C 0 0 0 0 0 0 0 0 0 0 0 0
157
+ 1.5541 -2.0675 0.2237 H 0 0 0 0 0 0 0 0 0 0 0 0
158
+ 0.9532 0.0967 1.4588 H 0 0 0 0 0 0 0 0 0 0 0 0
159
+ 1.6691 1.9630 -0.7430 H 0 0 0 0 0 0 0 0 0 0 0 0
160
+ 1.9423 2.0685 0.9712 H 0 0 0 0 0 0 0 0 0 0 0 0
161
+ 2.0851 -1.1104 -2.0638 H 0 0 0 0 0 0 0 0 0 0 0 0
162
+ 3.4846 -1.8820 -1.3506 H 0 0 0 0 0 0 0 0 0 0 0 0
163
+ 3.9137 -1.8918 1.0436 H 0 0 0 0 0 0 0 0 0 0 0 0
164
+ 2.7942 -1.1596 2.1923 H 0 0 0 0 0 0 0 0 0 0 0 0
165
+ 4.6485 0.0638 -1.5199 H 0 0 0 0 0 0 0 0 0 0 0 0
166
+ 3.2467 0.8670 -2.1831 H 0 0 0 0 0 0 0 0 0 0 0 0
167
+ 3.8541 0.8576 1.9828 H 0 0 0 0 0 0 0 0 0 0 0 0
168
+ 5.0353 0.0986 0.9430 H 0 0 0 0 0 0 0 0 0 0 0 0
169
+ 0.1304 1.1516 -2.0295 H 0 0 0 0 0 0 0 0 0 0 0 0
170
+ -0.3059 1.8245 1.8958 H 0 0 0 0 0 0 0 0 0 0 0 0
171
+ -1.0856 -1.7976 -2.2061 H 0 0 0 0 0 0 0 0 0 0 0 0
172
+ -2.2926 1.0941 -1.8795 H 0 0 0 0 0 0 0 0 0 0 0 0
173
+ -1.0974 -0.9178 2.0267 H 0 0 0 0 0 0 0 0 0 0 0 0
174
+ -1.8179 3.6927 2.3110 H 0 0 0 0 0 0 0 0 0 0 0 0
175
+ -2.3308 -3.8683 -1.7614 H 0 0 0 0 0 0 0 0 0 0 0 0
176
+ -3.7962 2.9864 -1.4300 H 0 0 0 0 0 0 0 0 0 0 0 0
177
+ -2.3260 -3.0022 2.4429 H 0 0 0 0 0 0 0 0 0 0 0 0
178
+ -3.5643 4.2999 0.6616 H 0 0 0 0 0 0 0 0 0 0 0 0
179
+ -2.9530 -4.4872 0.5586 H 0 0 0 0 0 0 0 0 0 0 0 0
180
+ 1 10 1 0
181
+ 1 35 1 0
182
+ 2 5 1 0
183
+ 2 8 1 0
184
+ 2 9 1 0
185
+ 3 4 1 0
186
+ 3 6 1 0
187
+ 3 7 1 0
188
+ 3 23 1 0
189
+ 4 5 1 0
190
+ 4 10 1 0
191
+ 4 24 1 1
192
+ 5 25 1 0
193
+ 5 26 1 0
194
+ 6 8 1 0
195
+ 6 27 1 0
196
+ 6 28 1 0
197
+ 7 9 1 0
198
+ 7 29 1 0
199
+ 7 30 1 0
200
+ 8 31 1 0
201
+ 8 32 1 0
202
+ 9 33 1 0
203
+ 9 34 1 0
204
+ 10 11 1 0
205
+ 10 12 1 0
206
+ 11 13 2 0
207
+ 11 15 1 0
208
+ 12 14 2 0
209
+ 12 16 1 0
210
+ 13 17 1 0
211
+ 13 36 1 0
212
+ 14 18 1 0
213
+ 14 37 1 0
214
+ 15 19 2 0
215
+ 15 38 1 0
216
+ 16 20 2 0
217
+ 16 39 1 0
218
+ 17 21 2 0
219
+ 17 40 1 0
220
+ 18 22 2 0
221
+ 18 41 1 0
222
+ 19 21 1 0
223
+ 19 42 1 0
224
+ 20 22 1 0
225
+ 20 43 1 0
226
+ 21 44 1 0
227
+ 22 45 1 0
228
+ M END
229
+ > <compoud_name> (2)
230
+ H1_Quifenadine
231
+
232
+ > <SMILES> (2)
233
+ [H]OC(c1c([H])c([H])c([H])c([H])c1[H])(c1c([H])c([H])c([H])c([H])c1[H])C1([H])C([H])([H])N2C([H])([H])C([H])([H])C1([H])C([H])([H])C2([H])[H]
234
+
235
+ > <cid> (2)
236
+ 65600
237
+
238
+ > <category> (2)
239
+ N
240
+
241
+ > <inchi> (2)
242
+ InChI=1S/C20H23NO/c22-20(17-7-3-1-4-8-17,18-9-5-2-6-10-18)19-15-21-13-11-16(19)12-14-21/h1-10,16,19,22H,11-15H2/t19-/m1/s1
243
+
244
+ > <Energy> (2)
245
+ 84.891
246
+
247
+ $$$$
248
+ H1_Rupatadine
249
+ RDKit 3D
250
+
251
+ 56 60 0 0 0 0 0 0 0 0999 V2000
252
+ 6.5298 3.3080 0.0562 Cl 0 0 0 0 0 0 0 0 0 0 0 0
253
+ -2.1780 1.1440 -0.1081 N 0 0 0 0 0 0 0 0 0 0 0 0
254
+ 1.8055 -2.5028 1.6263 N 0 0 0 0 0 0 0 0 0 0 0 0
255
+ -6.5347 -0.2932 -1.5666 N 0 0 0 0 0 0 0 0 0 0 0 0
256
+ 0.4984 0.2017 0.7391 C 0 0 0 0 0 0 0 0 0 0 0 0
257
+ -0.7596 -0.6401 0.9176 C 0 0 0 0 0 0 0 0 0 0 0 0
258
+ 0.1325 1.6779 0.6992 C 0 0 0 0 0 0 0 0 0 0 0 0
259
+ -1.8276 -0.2907 -0.1321 C 0 0 0 0 0 0 0 0 0 0 0 0
260
+ -0.9697 1.9571 -0.3378 C 0 0 0 0 0 0 0 0 0 0 0 0
261
+ 1.7535 -0.3064 0.5966 C 0 0 0 0 0 0 0 0 0 0 0 0
262
+ -3.2065 1.4670 -1.1132 C 0 0 0 0 0 0 0 0 0 0 0 0
263
+ 2.9347 0.5760 0.4016 C 0 0 0 0 0 0 0 0 0 0 0 0
264
+ 1.9383 -1.7730 0.4937 C 0 0 0 0 0 0 0 0 0 0 0 0
265
+ 3.7669 0.4917 -0.7359 C 0 0 0 0 0 0 0 0 0 0 0 0
266
+ 3.6248 -0.5108 -1.8705 C 0 0 0 0 0 0 0 0 0 0 0 0
267
+ 2.3939 -1.4219 -1.9523 C 0 0 0 0 0 0 0 0 0 0 0 0
268
+ 2.2514 -2.3194 -0.7533 C 0 0 0 0 0 0 0 0 0 0 0 0
269
+ -4.5656 0.8945 -0.7963 C 0 0 0 0 0 0 0 0 0 0 0 0
270
+ 3.2715 1.4705 1.4385 C 0 0 0 0 0 0 0 0 0 0 0 0
271
+ 4.8769 1.3617 -0.8210 C 0 0 0 0 0 0 0 0 0 0 0 0
272
+ 2.4290 -3.7014 -0.8308 C 0 0 0 0 0 0 0 0 0 0 0 0
273
+ 4.3729 2.3200 1.3344 C 0 0 0 0 0 0 0 0 0 0 0 0
274
+ 5.1670 2.2679 0.1982 C 0 0 0 0 0 0 0 0 0 0 0 0
275
+ -5.1566 1.0467 0.4633 C 0 0 0 0 0 0 0 0 0 0 0 0
276
+ -5.3042 0.2290 -1.7686 C 0 0 0 0 0 0 0 0 0 0 0 0
277
+ 2.2947 -4.4730 0.3198 C 0 0 0 0 0 0 0 0 0 0 0 0
278
+ 1.9875 -3.8347 1.5112 C 0 0 0 0 0 0 0 0 0 0 0 0
279
+ -6.4311 0.5316 0.7094 C 0 0 0 0 0 0 0 0 0 0 0 0
280
+ -7.0633 -0.1364 -0.3325 C 0 0 0 0 0 0 0 0 0 0 0 0
281
+ -7.0626 0.6338 2.0605 C 0 0 0 0 0 0 0 0 0 0 0 0
282
+ -0.5731 -1.7154 0.8560 H 0 0 0 0 0 0 0 0 0 0 0 0
283
+ -1.1596 -0.4557 1.9235 H 0 0 0 0 0 0 0 0 0 0 0 0
284
+ -0.2119 1.9818 1.6961 H 0 0 0 0 0 0 0 0 0 0 0 0
285
+ 0.9793 2.3217 0.4489 H 0 0 0 0 0 0 0 0 0 0 0 0
286
+ -1.4699 -0.5848 -1.1284 H 0 0 0 0 0 0 0 0 0 0 0 0
287
+ -2.7127 -0.8992 0.0866 H 0 0 0 0 0 0 0 0 0 0 0 0
288
+ -1.2287 3.0211 -0.2712 H 0 0 0 0 0 0 0 0 0 0 0 0
289
+ -0.5727 1.7824 -1.3473 H 0 0 0 0 0 0 0 0 0 0 0 0
290
+ -2.8776 1.1445 -2.1102 H 0 0 0 0 0 0 0 0 0 0 0 0
291
+ -3.3405 2.5558 -1.1674 H 0 0 0 0 0 0 0 0 0 0 0 0
292
+ 3.6660 0.0536 -2.8120 H 0 0 0 0 0 0 0 0 0 0 0 0
293
+ 4.5182 -1.1506 -1.8447 H 0 0 0 0 0 0 0 0 0 0 0 0
294
+ 2.4771 -2.0361 -2.8582 H 0 0 0 0 0 0 0 0 0 0 0 0
295
+ 1.4795 -0.8292 -2.0837 H 0 0 0 0 0 0 0 0 0 0 0 0
296
+ 2.6674 1.5029 2.3444 H 0 0 0 0 0 0 0 0 0 0 0 0
297
+ 5.5326 1.3154 -1.6888 H 0 0 0 0 0 0 0 0 0 0 0 0
298
+ 2.6741 -4.1805 -1.7747 H 0 0 0 0 0 0 0 0 0 0 0 0
299
+ 4.6043 3.0064 2.1437 H 0 0 0 0 0 0 0 0 0 0 0 0
300
+ -4.6110 1.5606 1.2526 H 0 0 0 0 0 0 0 0 0 0 0 0
301
+ -4.9162 0.0859 -2.7735 H 0 0 0 0 0 0 0 0 0 0 0 0
302
+ 2.4295 -5.5486 0.2902 H 0 0 0 0 0 0 0 0 0 0 0 0
303
+ 1.8762 -4.3969 2.4339 H 0 0 0 0 0 0 0 0 0 0 0 0
304
+ -8.0471 -0.5796 -0.2022 H 0 0 0 0 0 0 0 0 0 0 0 0
305
+ -8.1536 0.6818 1.9793 H 0 0 0 0 0 0 0 0 0 0 0 0
306
+ -6.7913 -0.2348 2.6683 H 0 0 0 0 0 0 0 0 0 0 0 0
307
+ -6.7355 1.5422 2.5773 H 0 0 0 0 0 0 0 0 0 0 0 0
308
+ 1 23 1 0
309
+ 2 8 1 0
310
+ 2 9 1 0
311
+ 2 11 1 0
312
+ 3 13 2 0
313
+ 3 27 1 0
314
+ 4 25 2 0
315
+ 4 29 1 0
316
+ 5 6 1 0
317
+ 5 7 1 0
318
+ 5 10 2 3
319
+ 6 8 1 0
320
+ 6 31 1 0
321
+ 6 32 1 0
322
+ 7 9 1 0
323
+ 7 33 1 0
324
+ 7 34 1 0
325
+ 8 35 1 0
326
+ 8 36 1 0
327
+ 9 37 1 0
328
+ 9 38 1 0
329
+ 10 12 1 0
330
+ 10 13 1 0
331
+ 11 18 1 0
332
+ 11 39 1 0
333
+ 11 40 1 0
334
+ 12 14 2 0
335
+ 12 19 1 0
336
+ 13 17 1 0
337
+ 14 15 1 0
338
+ 14 20 1 0
339
+ 15 16 1 0
340
+ 15 41 1 0
341
+ 15 42 1 0
342
+ 16 17 1 0
343
+ 16 43 1 0
344
+ 16 44 1 0
345
+ 17 21 2 0
346
+ 18 24 2 0
347
+ 18 25 1 0
348
+ 19 22 2 0
349
+ 19 45 1 0
350
+ 20 23 2 0
351
+ 20 46 1 0
352
+ 21 26 1 0
353
+ 21 47 1 0
354
+ 22 23 1 0
355
+ 22 48 1 0
356
+ 24 28 1 0
357
+ 24 49 1 0
358
+ 25 50 1 0
359
+ 26 27 2 0
360
+ 26 51 1 0
361
+ 27 52 1 0
362
+ 28 29 2 0
363
+ 28 30 1 0
364
+ 29 53 1 0
365
+ 30 54 1 0
366
+ 30 55 1 0
367
+ 30 56 1 0
368
+ M END
369
+ > <compoud_name> (3)
370
+ H1_Rupatadine
371
+
372
+ > <SMILES> (3)
373
+ [H]c1nc2c(c([H])c1[H])C([H])([H])C([H])([H])c1c([H])c(Cl)c([H])c([H])c1C2=C1C([H])([H])C([H])([H])N(C([H])([H])c2c([H])nc([H])c(C([H])([H])[H])c2[H])C([H])([H])C1([H])[H]
374
+
375
+ > <cid> (3)
376
+ 133017
377
+
378
+ > <category> (3)
379
+ N
380
+
381
+ > <inchi> (3)
382
+ InChI=1S/C26H26ClN3/c1-18-13-19(16-28-15-18)17-30-11-8-20(9-12-30)25-24-7-6-23(27)14-22(24)5-4-21-3-2-10-29-26(21)25/h2-3,6-7,10,13-16H,4-5,8-9,11-12,17H2,1H3
383
+
384
+ > <Energy> (3)
385
+ 119.976
386
+
387
+ $$$$
sample_input_smiles.csv ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ OC(=O)CCCN1CCC(OC(c2ncccc2)c2ccc(Cl)cc2)CC1
2
+ OC(c1ccccc1)(c1ccccc1)C1CN2CCC1CC2
3
+ c1nc2c(cc1)CCc1cc(Cl)ccc1C2=C1CCN(Cc2cncc(C)c2)CC1
4
+ C1=CC=C2C(=C1)C=CC3=CC=CC=C3N2C(=O)N
5
+ CC(=O)Oc1ccccc1C(=O)O
6
+ CC(=O)Oc1c(cc(cc1)Cl)C(=O)OC(=O)c1c(ccc(c1)Cl)OC(=O)C
utils.py ADDED
@@ -0,0 +1,172 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import itertools as it
2
+ import os
3
+
4
+ import joblib
5
+ import numpy as np
6
+ import pandas as pd
7
+ import pkg_resources
8
+ import streamlit as st
9
+ from b3clf.descriptor_padel import compute_descriptors
10
+ from b3clf.geometry_opt import geometry_optimize
11
+ from b3clf.utils import get_descriptors, scale_descriptors, select_descriptors
12
+
13
+
14
@st.cache_resource()
def load_all_models():
    """Load every pre-trained b3clf classifier shipped inside the ``b3clf`` package.

    One model is loaded for each (classifier, sampling strategy) pair from the
    packaged resource ``pre_trained/b3clf_<classifier>_<sampling>.joblib``.

    Returns
    -------
    dict
        Maps ``"<classifier>_<sampling>"`` to the object returned by
        ``joblib.load`` for that resource.
    """
    classifiers = ("dtree", "knn", "logreg", "xgb")
    sampling_strategies = (
        "borderline_SMOTE",
        "classic_ADASYN",
        "classic_RandUndersampling",
        "classic_SMOTE",
        "kmeans_SMOTE",
        "common",
    )

    models = {}
    for clf_name, sampling_name in it.product(classifiers, sampling_strategies):
        resource = f"pre_trained/b3clf_{clf_name}_{sampling_name}.joblib"
        # Read through pkg_resources so the lookup follows the installed package
        # rather than a path relative to this file.
        with pkg_resources.resource_stream("b3clf", resource) as stream:
            models[f"{clf_name}_{sampling_name}"] = joblib.load(stream)

    return models
41
+
42
+
43
@st.cache_resource
def predict_permeability(
    clf_str, sampling_str, _models_dict, mol_features, info_df, threshold="none"
):
    """Compute permeability predictions for the given feature data.

    Parameters
    ----------
    clf_str : str
        Classifier name; combined with ``sampling_str`` as
        ``"<clf_str>_<sampling_str>"`` to pick the model from ``_models_dict``.
    sampling_str : str
        Sampling strategy name (second half of the model key).
    _models_dict : dict
        Fitted models keyed by ``"<clf>_<sampling>"``. The leading underscore
        follows Streamlit's convention for arguments excluded from cache hashing.
    mol_features : pandas.DataFrame or array-like
        Feature matrix handed to ``predict_proba``; must expose ``.shape``.
    info_df : pandas.DataFrame
        Per-molecule information. NOTE: it is modified in place (two
        prediction columns are added and the index is reset) and the same
        object is returned.
    threshold : str
        Column of the packaged ``B3clf_thresholds.xlsx`` sheet to read the
        probability cut-off from.

    Returns
    -------
    pandas.DataFrame
        ``info_df`` with ``B3clf_predicted_probability`` and
        ``B3clf_predicted_label`` columns added and its former index moved
        into a column.

    Raises
    ------
    ValueError
        If ``mol_features`` is a DataFrame whose index differs from that of
        ``info_df``.
    """
    pred_model = _models_dict[clf_str + "_" + sampling_str]

    # Load the table of probability thresholds bundled with b3clf.
    package_name = "b3clf"
    with pkg_resources.resource_stream(package_name, "data/B3clf_thresholds.xlsx") as f:
        df_thres = pd.read_excel(f, index_col=0, engine="openpyxl")

    # Every molecule starts with label 0; those at or above the threshold
    # are switched to 1 below.
    label_pool = np.zeros(mol_features.shape[0], dtype=int)

    # isinstance (rather than an exact type comparison) so DataFrame
    # subclasses are validated as well.
    if isinstance(mol_features, pd.DataFrame):
        if mol_features.index.tolist() != info_df.index.tolist():
            raise ValueError("Features_df and Info_df do not have the same index.")

    # Probability of the positive class (second column of predict_proba).
    info_df.loc[:, "B3clf_predicted_probability"] = pred_model.predict_proba(
        mol_features
    )[:, 1]

    # NOTE(review): the threshold row is fixed to "xgb-classic_ADASYN" for every
    # classifier/sampling combination instead of being looked up from
    # clf_str/sampling_str -- confirm this is intended before using thresholds
    # other than "none".
    mask = np.greater_equal(
        info_df["B3clf_predicted_probability"].to_numpy(),
        df_thres.loc["xgb-classic_ADASYN", threshold],
    )
    label_pool[mask] = 1

    # Store the labels and turn the index into a regular column.
    info_df["B3clf_predicted_label"] = label_pool
    info_df.reset_index(inplace=True)

    return info_df
81
+
82
+
83
@st.cache_resource
def generate_predictions(
    input_fname: str = None,
    # Written with an escaped backslash: "\s" in a plain string literal is an
    # invalid escape sequence. The runtime value is unchanged.
    sep: str = "\\s+|\t+",
    clf: str = "xgb",
    _models_dict: dict = None,
    keep_sdf: str = "no",
    sampling: str = "classic_ADASYN",
    time_per_mol: int = 120,
    mol_features: pd.DataFrame = None,
    info_df: pd.DataFrame = None,
):
    """Generate permeability predictions for an input file or precomputed features.

    When both ``mol_features`` and ``info_df`` are supplied they are used
    directly. Otherwise the full pipeline runs on ``input_fname``: geometry
    optimization, descriptor computation, descriptor selection and scaling.

    Parameters
    ----------
    input_fname : str, optional
        Path to a ``.csv``, ``.txt``, ``.smi`` or ``.sdf`` input file.
    sep : str
        Column separator forwarded to ``geometry_optimize``. It is overridden
        with ``","`` for ``.csv`` files and with the whitespace pattern for
        ``.txt``/``.smi`` files.
    clf : str
        Classifier name used to select the model.
    _models_dict : dict
        Fitted models keyed by ``"<clf>_<sampling>"``.
    keep_sdf : str
        ``"no"`` removes the intermediate ``<name>_optimized_3d.sdf`` file after
        descriptor computation; any other value keeps it.
    sampling : str
        Sampling strategy name used to select the model.
    time_per_mol : int
        Passed to ``compute_descriptors`` as ``time_per_molecule``; twice this
        value is passed as ``timeout``.
    mol_features : pandas.DataFrame, optional
        Precomputed, selected and scaled descriptors.
    info_df : pandas.DataFrame, optional
        Molecule information matching ``mol_features``.

    Returns
    -------
    tuple
        ``(mol_features, info_df, result_df)``. ``result_df`` holds whichever
        of the ``ID``, ``SMILES``, ``B3clf_predicted_probability`` and
        ``B3clf_predicted_label`` columns are present. ``info_df`` is the
        object handed to ``predict_permeability``, which updates it in place.

    Raises
    ------
    ValueError
        If features must be computed but ``input_fname`` is missing, or the
        file extension is not supported.
    Exception
        Any other failure is reported through ``st.error`` and re-raised.
    """
    try:
        # Both pieces are required downstream; if either one is missing the
        # pipeline has to (re)compute them from the input file.
        if mol_features is None or info_df is None:
            if input_fname is None:
                raise ValueError("Either input_fname or mol_features/info_df must be provided")

            mol_tag = os.path.basename(input_fname).split(".")[0]
            file_ext = os.path.splitext(input_fname)[1].lower()
            internal_sdf = f"{mol_tag}_optimized_3d.sdf"

            try:
                # Pick the separator from the file type; SDF input keeps the
                # caller-supplied value.
                if file_ext == ".csv":
                    sep = ","
                elif file_ext in (".txt", ".smi"):
                    sep = "\\s+|\t+"
                elif file_ext != ".sdf":
                    raise ValueError(f"Unsupported file type: {file_ext}")

                # Geometry optimization
                geometry_optimize(input_fname=input_fname, output_sdf=internal_sdf, sep=sep)

                # Compute descriptors with timeout handling
                df_features = compute_descriptors(
                    sdf_file=internal_sdf,
                    excel_out=None,
                    output_csv=None,
                    timeout=time_per_mol * 2,  # Double the per-molecule time for total timeout
                    time_per_molecule=time_per_mol,
                )

                # Split the computed table into descriptors and molecule info
                mol_features, info_df = get_descriptors(df=df_features)

                # Select descriptors
                mol_features = select_descriptors(df=mol_features)

                # Scale descriptors, keeping the existing frame (index/columns)
                mol_features.iloc[:, :] = scale_descriptors(df=mol_features)

            finally:
                # Best-effort removal of the intermediate SDF: a failed delete
                # must not mask the real outcome, but only filesystem errors
                # are ignored.
                if os.path.exists(internal_sdf) and keep_sdf == "no":
                    try:
                        os.remove(internal_sdf)
                    except OSError:
                        pass

        # Get predictions
        result_df = predict_permeability(
            clf_str=clf,
            sampling_str=sampling,
            _models_dict=_models_dict,
            mol_features=mol_features,
            info_df=info_df,
            threshold="none",
        )

        # Keep only the columns meant for display, in their existing order
        display_cols = [
            "ID",
            "SMILES",
            "B3clf_predicted_probability",
            "B3clf_predicted_label",
        ]

        result_df = result_df[
            [col for col in result_df.columns.to_list() if col in display_cols]
        ]

        return mol_features, info_df, result_df

    except Exception as e:
        import traceback

        # Surface the failure in the Streamlit UI, then let it propagate.
        st.error(f"Error in generate_predictions: {str(e)}\n{traceback.format_exc()}")
        raise