| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
|
|
| """ |
| Main B3clf Script. |
| """ |
|
|
| |
| import os |
|
|
| import numpy as np |
| from .descriptor_padel import compute_descriptors |
| from .geometry_opt import geometry_optimize |
| from .utils import ( |
| get_descriptors, |
| predict_permeability, |
| scale_descriptors, |
| select_descriptors, |
| ) |
|
|
# Public API of this module: only the top-level prediction entry point.
__all__ = ["b3clf"]
|
|
|
|
def b3clf(
    mol_in,
    sep="\\s+|\t+",
    clf="xgb",
    sampling="classic_ADASYN",
    output="B3clf_output.xlsx",
    verbose=1,
    random_seed=42,
    time_per_mol=-1,
    keep_features="no",
    keep_sdf="no",
    threshold="none",
):
    r"""Use B3clf for BBB classifications with resampling strategies.

    The function runs the whole pipeline: geometry optimization of the input
    molecules, descriptor calculation, descriptor selection and scaling, and
    finally the permeability prediction, whose result is printed (optionally),
    written to an Excel file and returned.

    Parameters
    ----------
    mol_in : str
        Input molecule text file which can be SMILES strings (file extension with .smi or .csv)
        or SDF file format. No space is allowed for molecular name if input is a file with SMILES
        strings.
    sep : str, optional
        Separator used to parse data if a text file with SMILES strings is provided.
        Default="\s+|\t+" which will take any space and any tab as delimiter.
    clf : str, optional
        Classification algorithm, which can be "dtree" for decision trees, "knn" for kNN, "logreg"
        for logistic regression and "xgb" for XGBoost. Default="xgb".
    sampling : str, optional
        Sampling strategies that can be used which includes "common",
        "RandUndersampling", "SMOTE", "borderline_SMOTE", "kmeans_SMOTE" and "classic_ADASYN". The
        "common" denotes that no resampling strategy is employed. Default="classic_ADASYN".
    output : str, optional
        Output file name for the predicted results consisting of molecule ID, predicted
        probability and labels for BBB permeability. Default="B3clf_output.xlsx".
    verbose : int, optional
        When verbose is zero, no results are printed out. Otherwise, the program prints the
        predictions. Default=1.
    random_seed : int, optional
        Random seed for reproducibility. Default=42. Accepted for backward compatibility: this
        function does not forward it to any of the pipeline steps it calls.
    time_per_mol : int, optional
        Time limit for each molecule in seconds. Default=-1, which means no time limit.
    keep_features : str, optional
        To keep intermediate molecular feature file, "yes" or "no". Default="no".
    keep_sdf : str, optional
        To keep intermediate molecular geometry file with 3D coordinates, "yes" or "no".
        Default="no".
    threshold : str, optional
        To set the threshold for the predicted probability which can be "none", "J_threshold" and
        "F_threshold". "J_threshold" will use threshold optimized from Youden's J statistic.
        "F_threshold" will use threshold optimized from F score. Default="none".

    Returns
    -------
    result_df : pandas.DataFrame
        Result of BBB predictions with molecule ID/name, predicted probability and predicted labels.

    Notes
    -----
    Two intermediate files, ``<tag>_padel_descriptors.xlsx`` and ``<tag>_optimized_3d.sdf``, are
    written relative to the current working directory, where ``<tag>`` is the base name of
    `mol_in` up to its first dot. Unless the matching ``keep_*`` option is "yes", they are removed
    before the function exits, including when one of the pipeline steps raises.

    """
    # Intermediate file names are derived from the input file's base name
    # (everything before its first dot).
    mol_tag = os.path.basename(mol_in).split(".")[0]
    features_out = f"{mol_tag}_padel_descriptors.xlsx"
    internal_sdf = f"{mol_tag}_optimized_3d.sdf"

    try:
        # Step 1: produce the geometry file that the descriptor step reads.
        geometry_optimize(input_fname=mol_in, output_sdf=internal_sdf, sep=sep)

        # Step 2: compute descriptors from that geometry file; only the Excel
        # file is needed afterwards, so the return value is discarded.
        compute_descriptors(
            sdf_file=internal_sdf,
            excel_out=features_out,
            output_csv=None,
            timeout=None,
            time_per_molecule=time_per_mol,
        )

        # Step 3: load the descriptors, then select and scale them.
        X_features, info_df = get_descriptors(df=features_out)
        X_features = select_descriptors(df=X_features)
        X_features = scale_descriptors(df=X_features)

        # Step 4: predict with the requested classifier/sampling combination.
        result_df = predict_permeability(
            clf_str=clf,
            sampling_str=sampling,
            mol_features=X_features,
            info_df=info_df,
            threshold=threshold,
        )

        # Keep only the reportable columns, preserving the order in which they
        # appear in the prediction table.
        display_cols = [
            "ID",
            "SMILES",
            "B3clf_predicted_probability",
            "B3clf_predicted_label",
        ]
        result_df = result_df[
            [col for col in result_df.columns.to_list() if col in display_cols]
        ]
        if verbose != 0:
            print(result_df)

        result_df.to_excel(output, index=False, engine="openpyxl")
    finally:
        # Clean up intermediates even when a step above failed, so an aborted
        # run does not leave stray files behind.
        leftovers = []
        if keep_features != "yes":
            leftovers.append(features_out)
        if keep_sdf != "yes":
            leftovers.append(internal_sdf)
        for path in leftovers:
            try:
                os.remove(path)
            except FileNotFoundError:
                # The step that writes this file never ran or failed early;
                # there is nothing to delete and any in-flight exception must
                # not be masked.
                pass

    return result_df
|
|