Commit
·
0e95800
1
Parent(s):
d05f89f
Synced repo using 'sync_with_huggingface' Github Action
Browse files
- b3clf/b3clf.py +43 -28
- b3clf/utils.py +14 -9
b3clf/b3clf.py
CHANGED
|
@@ -31,26 +31,31 @@ import os
|
|
| 31 |
import numpy as np
|
| 32 |
from .descriptor_padel import compute_descriptors
|
| 33 |
from .geometry_opt import geometry_optimize
|
| 34 |
-
from .utils import (
|
| 35 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 36 |
|
| 37 |
__all__ = [
|
| 38 |
"b3clf",
|
| 39 |
]
|
| 40 |
|
| 41 |
|
| 42 |
-
def b3clf(
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
|
|
|
|
| 54 |
"""Use B3clf for BBB classifications with resampling strategies.
|
| 55 |
|
| 56 |
Parameters
|
|
@@ -110,12 +115,13 @@ def b3clf(mol_in,
|
|
| 110 |
|
| 111 |
geometry_optimize(input_fname=mol_in, output_sdf=internal_sdf, sep=sep)
|
| 112 |
|
| 113 |
-
_ = compute_descriptors(
|
| 114 |
-
|
| 115 |
-
|
| 116 |
-
|
| 117 |
-
|
| 118 |
-
|
|
|
|
| 119 |
|
| 120 |
# Get computed descriptors
|
| 121 |
X_features, info_df = get_descriptors(df=features_out)
|
|
@@ -131,16 +137,25 @@ def b3clf(mol_in,
|
|
| 131 |
# clf = get_clf(clf_str=clf, sampling_str=sampling)
|
| 132 |
|
| 133 |
# Get classifier
|
| 134 |
-
result_df = predict_permeability(
|
| 135 |
-
|
| 136 |
-
|
| 137 |
-
|
| 138 |
-
|
|
|
|
|
|
|
| 139 |
|
| 140 |
# Get classifier
|
| 141 |
-
display_cols = [
|
| 142 |
-
|
| 143 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 144 |
if verbose != 0:
|
| 145 |
print(result_df)
|
| 146 |
|
|
|
|
| 31 |
import numpy as np
|
| 32 |
from .descriptor_padel import compute_descriptors
|
| 33 |
from .geometry_opt import geometry_optimize
|
| 34 |
+
from .utils import (
|
| 35 |
+
get_descriptors,
|
| 36 |
+
predict_permeability,
|
| 37 |
+
scale_descriptors,
|
| 38 |
+
select_descriptors,
|
| 39 |
+
)
|
| 40 |
|
| 41 |
__all__ = [
|
| 42 |
"b3clf",
|
| 43 |
]
|
| 44 |
|
| 45 |
|
| 46 |
+
def b3clf(
|
| 47 |
+
mol_in,
|
| 48 |
+
sep="\s+|\t+",
|
| 49 |
+
clf="xgb",
|
| 50 |
+
sampling="classic_ADASYN",
|
| 51 |
+
output="B3clf_output.xlsx",
|
| 52 |
+
verbose=1,
|
| 53 |
+
random_seed=42,
|
| 54 |
+
time_per_mol=-1,
|
| 55 |
+
keep_features="no",
|
| 56 |
+
keep_sdf="no",
|
| 57 |
+
threshold="none",
|
| 58 |
+
):
|
| 59 |
"""Use B3clf for BBB classifications with resampling strategies.
|
| 60 |
|
| 61 |
Parameters
|
|
|
|
| 115 |
|
| 116 |
geometry_optimize(input_fname=mol_in, output_sdf=internal_sdf, sep=sep)
|
| 117 |
|
| 118 |
+
_ = compute_descriptors(
|
| 119 |
+
sdf_file=internal_sdf,
|
| 120 |
+
excel_out=features_out,
|
| 121 |
+
output_csv=None,
|
| 122 |
+
timeout=None,
|
| 123 |
+
time_per_molecule=time_per_mol,
|
| 124 |
+
)
|
| 125 |
|
| 126 |
# Get computed descriptors
|
| 127 |
X_features, info_df = get_descriptors(df=features_out)
|
|
|
|
| 137 |
# clf = get_clf(clf_str=clf, sampling_str=sampling)
|
| 138 |
|
| 139 |
# Get classifier
|
| 140 |
+
result_df = predict_permeability(
|
| 141 |
+
clf_str=clf,
|
| 142 |
+
sampling_str=sampling,
|
| 143 |
+
mol_features=X_features,
|
| 144 |
+
info_df=info_df,
|
| 145 |
+
threshold=threshold,
|
| 146 |
+
)
|
| 147 |
|
| 148 |
# Get classifier
|
| 149 |
+
display_cols = [
|
| 150 |
+
"ID",
|
| 151 |
+
"SMILES",
|
| 152 |
+
"B3clf_predicted_probability",
|
| 153 |
+
"B3clf_predicted_label",
|
| 154 |
+
]
|
| 155 |
+
|
| 156 |
+
result_df = result_df[
|
| 157 |
+
[col for col in result_df.columns.to_list() if col in display_cols]
|
| 158 |
+
]
|
| 159 |
if verbose != 0:
|
| 160 |
print(result_df)
|
| 161 |
|
b3clf/utils.py
CHANGED
|
@@ -89,9 +89,9 @@ def scale_descriptors(df):
|
|
| 89 |
dirname = os.path.dirname(__file__)
|
| 90 |
filename = os.path.join(dirname, "pre_trained", "b3clf_scaler.joblib")
|
| 91 |
b3db_scaler = load(filename)
|
| 92 |
-
|
| 93 |
|
| 94 |
-
return
|
| 95 |
|
| 96 |
|
| 97 |
def get_clf(clf_str, sampling_str):
|
|
@@ -125,7 +125,9 @@ def get_clf(clf_str, sampling_str):
|
|
| 125 |
return clf
|
| 126 |
|
| 127 |
|
| 128 |
-
def predict_permeability(
|
|
|
|
|
|
|
| 129 |
"""Compute and store BBB predicted label and predicted probability to results dataframe."""
|
| 130 |
|
| 131 |
# load the threshold data
|
|
@@ -133,18 +135,21 @@ def predict_permeability(clf_str, sampling_str, features_df, info_df, threshold=
|
|
| 133 |
fpath_thres = os.path.join(dirname, "data", "B3clf_thresholds.xlsx")
|
| 134 |
df_thres = pd.read_excel(fpath_thres, index_col=0, engine="openpyxl")
|
| 135 |
# default threshold is 0.5
|
| 136 |
-
label_pool = np.zeros(
|
| 137 |
|
| 138 |
# get the classifier
|
| 139 |
clf = get_clf(clf_str=clf_str, sampling_str=sampling_str)
|
| 140 |
|
| 141 |
-
if
|
| 142 |
-
|
| 143 |
-
|
| 144 |
-
|
|
|
|
| 145 |
|
| 146 |
# get predicted probabilities
|
| 147 |
-
info_df.loc[:, "B3clf_predicted_probability"] = clf.predict_proba(
|
|
|
|
|
|
|
| 148 |
# get predicted label from probability using the threshold
|
| 149 |
mask = np.greater_equal(
|
| 150 |
info_df["B3clf_predicted_probability"].to_numpy(),
|
|
|
|
| 89 |
dirname = os.path.dirname(__file__)
|
| 90 |
filename = os.path.join(dirname, "pre_trained", "b3clf_scaler.joblib")
|
| 91 |
b3db_scaler = load(filename)
|
| 92 |
+
df_new = b3db_scaler.transform(df)
|
| 93 |
|
| 94 |
+
return df_new
|
| 95 |
|
| 96 |
|
| 97 |
def get_clf(clf_str, sampling_str):
|
|
|
|
| 125 |
return clf
|
| 126 |
|
| 127 |
|
| 128 |
+
def predict_permeability(
|
| 129 |
+
clf_str, sampling_str, mol_features, info_df, threshold="none"
|
| 130 |
+
):
|
| 131 |
"""Compute and store BBB predicted label and predicted probability to results dataframe."""
|
| 132 |
|
| 133 |
# load the threshold data
|
|
|
|
| 135 |
fpath_thres = os.path.join(dirname, "data", "B3clf_thresholds.xlsx")
|
| 136 |
df_thres = pd.read_excel(fpath_thres, index_col=0, engine="openpyxl")
|
| 137 |
# default threshold is 0.5
|
| 138 |
+
label_pool = np.zeros(mol_features.shape[0], dtype=int)
|
| 139 |
|
| 140 |
# get the classifier
|
| 141 |
clf = get_clf(clf_str=clf_str, sampling_str=sampling_str)
|
| 142 |
|
| 143 |
+
if type(mol_features) == pd.DataFrame:
|
| 144 |
+
if mol_features.index.tolist() != info_df.index.tolist():
|
| 145 |
+
raise ValueError(
|
| 146 |
+
"Features_df and Info_df do not have the same index. Internal processing error"
|
| 147 |
+
)
|
| 148 |
|
| 149 |
# get predicted probabilities
|
| 150 |
+
info_df.loc[:, "B3clf_predicted_probability"] = clf.predict_proba(mol_features)[
|
| 151 |
+
:, 1
|
| 152 |
+
]
|
| 153 |
# get predicted label from probability using the threshold
|
| 154 |
mask = np.greater_equal(
|
| 155 |
info_df["B3clf_predicted_probability"].to_numpy(),
|