Upload 13 files
Browse files- .gitattributes +2 -0
- LICENSE +21 -0
- README.md +6 -0
- catboost/adjacency_preprocess.ipynb +0 -0
- catboost/datasets/filtered_dataset.csv +0 -0
- catboost/datasets/interactions.json +3 -0
- catboost/inference.py +37 -0
- catboost/models/catboost_model.cbm +0 -0
- catboost/models/catboost_model2.cbm +3 -0
- catboost/preprocess_catboost.ipynb +1858 -0
- catboost/train.py +66 -0
- dimensionality_reduction.ipynb +0 -0
- link_prediction.ipynb +1482 -0
- parse.ipynb +698 -0
.gitattributes
CHANGED
|
@@ -32,3 +32,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
| 32 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 33 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
| 32 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 33 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 35 |
+
catboost/datasets/interactions.json filter=lfs diff=lfs merge=lfs -text
|
| 36 |
+
catboost/models/catboost_model2.cbm filter=lfs diff=lfs merge=lfs -text
|
LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
MIT License
|
| 2 |
+
|
| 3 |
+
Copyright (c) 2023 BP Rimal
|
| 4 |
+
|
| 5 |
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
| 6 |
+
of this software and associated documentation files (the "Software"), to deal
|
| 7 |
+
in the Software without restriction, including without limitation the rights
|
| 8 |
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
| 9 |
+
copies of the Software, and to permit persons to whom the Software is
|
| 10 |
+
furnished to do so, subject to the following conditions:
|
| 11 |
+
|
| 12 |
+
The above copyright notice and this permission notice shall be included in all
|
| 13 |
+
copies or substantial portions of the Software.
|
| 14 |
+
|
| 15 |
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
| 16 |
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
| 17 |
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
| 18 |
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
| 19 |
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
| 20 |
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
| 21 |
+
SOFTWARE.
|
README.md
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Drug-Drug-Interaction-Classification
|
| 2 |
+
Drug to Drug Interaction Classifier
|
| 3 |
+
|
| 4 |
+
We developed an innovative approach to a crucial challenge in drug-drug interaction research. While existing state-of-the-art link prediction models rely on prior knowledge of a drug's interactions with other drugs, our solution uses CatBoost to classify potential interactions based solely on the drugs' intrinsic properties.
|
| 5 |
+
|
| 6 |
+
We developed a new CatBoost-based method for predicting drug interactions that relies solely on the drugs' intrinsic properties, rather than on prior knowledge of a drug's interactions. We achieved a high accuracy of 0.85 and an AUC-ROC score of 0.86. This breakthrough provides a more efficient and cost-effective approach to predicting drug interactions, particularly for new drugs without prior interaction data.
|
catboost/adjacency_preprocess.ipynb
ADDED
|
File without changes
|
catboost/datasets/filtered_dataset.csv
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
catboost/datasets/interactions.json
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:1ef9c2162886244241c98e32cba62fd929f18f453273209c15408545b7c33b5c
|
| 3 |
+
size 32756785
|
catboost/inference.py
ADDED
|
@@ -0,0 +1,37 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pandas as pd
|
| 2 |
+
from sklearn.model_selection import train_test_split
|
| 3 |
+
from catboost import CatBoostClassifier, Pool
|
| 4 |
+
from sklearn.metrics import roc_auc_score
|
| 5 |
+
from sklearn.metrics import accuracy_score
|
| 6 |
+
from pandas.core.common import random_state
|
| 7 |
+
import numpy as np
|
| 8 |
+
|
| 9 |
+
# Score a previously trained CatBoost classifier on a held-out part of the
# dataset and print its ROC AUC and accuracy.

# Load the dataset; the first CSV column is used as the DataFrame index.
catboost_df = pd.read_csv('datasets/catboost_df.csv', index_col=0)
# Drop name_x / name_y so they are not passed to the model as features.
catboost_df = catboost_df.drop(['name_x', 'name_y'], axis=1)

# Partition the columns by dtype: object columns are treated as categorical,
# float64 columns as numeric.
cat_features = list(catboost_df.select_dtypes(include=['object']).columns)
float_features = list(catboost_df.select_dtypes(include=['float64']).columns)

for feature in float_features:
    # Fill NaN values with the mean of the non-missing values in the same
    # column. The result is assigned back to the frame on purpose:
    # `catboost_df[feature].fillna(..., inplace=True)` operates on a column
    # selection and, with pandas Copy-on-Write, leaves `catboost_df` unchanged.
    # NOTE(review): the mean is computed over all rows, including the rows
    # that later land in the test partition.
    mean_value = catboost_df[feature].mean()
    catboost_df[feature] = catboost_df[feature].fillna(mean_value)

for feature in cat_features:
    # Cast categorical columns to str; missing values become the string 'nan'.
    catboost_df[feature] = catboost_df[feature].astype(str)

# Separate the target column from the features and carve out the test set.
X = catboost_df.drop('interaction', axis=1)
y = catboost_df['interaction']
# NOTE(review): test_size=0.6 / random_state=42 presumably mirror the split
# used when the model was trained -- confirm against train.py before changing.
# The train partitions are not needed here, hence the throwaway names.
_X_train, X_test, _y_train, y_test = train_test_split(
    X, y, test_size=0.6, random_state=42)

# Restore the trained model from disk.
inference = CatBoostClassifier()
inference.load_model("models/catboost_model2.cbm")

# Take column 1 of predict_proba as the score (assumed to be the probability
# of the positive class, label 1) and threshold it at 0.5 for hard labels.
y_pred = inference.predict_proba(X_test)[:, 1]
y_pred_binary = np.where(y_pred > 0.5, 1, 0)
print(f"Test AUC_ROC score = {roc_auc_score(y_test, y_pred)}")
print(f"Accuracy Score= {accuracy_score(y_test, y_pred_binary)}")
|
catboost/models/catboost_model.cbm
ADDED
|
Binary file (303 kB). View file
|
|
|
catboost/models/catboost_model2.cbm
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:61dc8e9631740294617bf90947493b3955b6032e453cd3aa7e9a8f9d28d7f292
|
| 3 |
+
size 1186180
|
catboost/preprocess_catboost.ipynb
ADDED
|
@@ -0,0 +1,1858 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"cells": [
|
| 3 |
+
{
|
| 4 |
+
"cell_type": "code",
|
| 5 |
+
"execution_count": 2,
|
| 6 |
+
"metadata": {},
|
| 7 |
+
"outputs": [],
|
| 8 |
+
"source": [
|
| 9 |
+
"import pandas as pd"
|
| 10 |
+
]
|
| 11 |
+
},
|
| 12 |
+
{
|
| 13 |
+
"cell_type": "code",
|
| 14 |
+
"execution_count": 7,
|
| 15 |
+
"metadata": {},
|
| 16 |
+
"outputs": [
|
| 17 |
+
{
|
| 18 |
+
"data": {
|
| 19 |
+
"text/html": [
|
| 20 |
+
"<div>\n",
|
| 21 |
+
"<style scoped>\n",
|
| 22 |
+
" .dataframe tbody tr th:only-of-type {\n",
|
| 23 |
+
" vertical-align: middle;\n",
|
| 24 |
+
" }\n",
|
| 25 |
+
"\n",
|
| 26 |
+
" .dataframe tbody tr th {\n",
|
| 27 |
+
" vertical-align: top;\n",
|
| 28 |
+
" }\n",
|
| 29 |
+
"\n",
|
| 30 |
+
" .dataframe thead th {\n",
|
| 31 |
+
" text-align: right;\n",
|
| 32 |
+
" }\n",
|
| 33 |
+
"</style>\n",
|
| 34 |
+
"<table border=\"1\" class=\"dataframe\">\n",
|
| 35 |
+
" <thead>\n",
|
| 36 |
+
" <tr style=\"text-align: right;\">\n",
|
| 37 |
+
" <th></th>\n",
|
| 38 |
+
" <th>name</th>\n",
|
| 39 |
+
" <th>state</th>\n",
|
| 40 |
+
" <th>level4</th>\n",
|
| 41 |
+
" <th>level3</th>\n",
|
| 42 |
+
" <th>level2</th>\n",
|
| 43 |
+
" <th>level1</th>\n",
|
| 44 |
+
" <th>Molecular Weight</th>\n",
|
| 45 |
+
" <th>logP</th>\n",
|
| 46 |
+
" <th>Water Solubility</th>\n",
|
| 47 |
+
" <th>logS</th>\n",
|
| 48 |
+
" <th>...</th>\n",
|
| 49 |
+
" <th>Rotatable Bond Count</th>\n",
|
| 50 |
+
" <th>Polar Surface Area (PSA)</th>\n",
|
| 51 |
+
" <th>pKa (strongest basic)</th>\n",
|
| 52 |
+
" <th>Ghose Filter</th>\n",
|
| 53 |
+
" <th>Monoisotopic Weight</th>\n",
|
| 54 |
+
" <th>MDDR-Like Rule</th>\n",
|
| 55 |
+
" <th>Polarizability</th>\n",
|
| 56 |
+
" <th>H Bond Acceptor Count</th>\n",
|
| 57 |
+
" <th>Physiological Charge</th>\n",
|
| 58 |
+
" <th>Rule of Five</th>\n",
|
| 59 |
+
" </tr>\n",
|
| 60 |
+
" </thead>\n",
|
| 61 |
+
" <tbody>\n",
|
| 62 |
+
" <tr>\n",
|
| 63 |
+
" <th>5</th>\n",
|
| 64 |
+
" <td>Bivalirudin</td>\n",
|
| 65 |
+
" <td>solid</td>\n",
|
| 66 |
+
" <td>B01AE</td>\n",
|
| 67 |
+
" <td>B01A</td>\n",
|
| 68 |
+
" <td>B01</td>\n",
|
| 69 |
+
" <td>B</td>\n",
|
| 70 |
+
" <td>2180.2853</td>\n",
|
| 71 |
+
" <td>-14.00</td>\n",
|
| 72 |
+
" <td>0.04640</td>\n",
|
| 73 |
+
" <td>-4.7</td>\n",
|
| 74 |
+
" <td>...</td>\n",
|
| 75 |
+
" <td>66.0</td>\n",
|
| 76 |
+
" <td>901.57</td>\n",
|
| 77 |
+
" <td>11.88</td>\n",
|
| 78 |
+
" <td>0.0</td>\n",
|
| 79 |
+
" <td>2178.985813</td>\n",
|
| 80 |
+
" <td>1.0</td>\n",
|
| 81 |
+
" <td>218.54</td>\n",
|
| 82 |
+
" <td>37.0</td>\n",
|
| 83 |
+
" <td>-4.0</td>\n",
|
| 84 |
+
" <td>0.0</td>\n",
|
| 85 |
+
" </tr>\n",
|
| 86 |
+
" <tr>\n",
|
| 87 |
+
" <th>6</th>\n",
|
| 88 |
+
" <td>Leuprolide</td>\n",
|
| 89 |
+
" <td>solid</td>\n",
|
| 90 |
+
" <td>L02AE</td>\n",
|
| 91 |
+
" <td>L02A</td>\n",
|
| 92 |
+
" <td>L02</td>\n",
|
| 93 |
+
" <td>L</td>\n",
|
| 94 |
+
" <td>1209.3983</td>\n",
|
| 95 |
+
" <td>-2.40</td>\n",
|
| 96 |
+
" <td>0.03380</td>\n",
|
| 97 |
+
" <td>-4.6</td>\n",
|
| 98 |
+
" <td>...</td>\n",
|
| 99 |
+
" <td>32.0</td>\n",
|
| 100 |
+
" <td>429.04</td>\n",
|
| 101 |
+
" <td>11.92</td>\n",
|
| 102 |
+
" <td>0.0</td>\n",
|
| 103 |
+
" <td>1208.645462</td>\n",
|
| 104 |
+
" <td>1.0</td>\n",
|
| 105 |
+
" <td>125.24</td>\n",
|
| 106 |
+
" <td>16.0</td>\n",
|
| 107 |
+
" <td>1.0</td>\n",
|
| 108 |
+
" <td>0.0</td>\n",
|
| 109 |
+
" </tr>\n",
|
| 110 |
+
" <tr>\n",
|
| 111 |
+
" <th>13</th>\n",
|
| 112 |
+
" <td>Goserelin</td>\n",
|
| 113 |
+
" <td>solid</td>\n",
|
| 114 |
+
" <td>L02AE</td>\n",
|
| 115 |
+
" <td>L02A</td>\n",
|
| 116 |
+
" <td>L02</td>\n",
|
| 117 |
+
" <td>L</td>\n",
|
| 118 |
+
" <td>1269.4105</td>\n",
|
| 119 |
+
" <td>-5.10</td>\n",
|
| 120 |
+
" <td>0.02830</td>\n",
|
| 121 |
+
" <td>-4.6</td>\n",
|
| 122 |
+
" <td>...</td>\n",
|
| 123 |
+
" <td>33.0</td>\n",
|
| 124 |
+
" <td>495.89</td>\n",
|
| 125 |
+
" <td>10.91</td>\n",
|
| 126 |
+
" <td>0.0</td>\n",
|
| 127 |
+
" <td>1268.641439</td>\n",
|
| 128 |
+
" <td>1.0</td>\n",
|
| 129 |
+
" <td>130.74</td>\n",
|
| 130 |
+
" <td>18.0</td>\n",
|
| 131 |
+
" <td>1.0</td>\n",
|
| 132 |
+
" <td>0.0</td>\n",
|
| 133 |
+
" </tr>\n",
|
| 134 |
+
" <tr>\n",
|
| 135 |
+
" <th>25</th>\n",
|
| 136 |
+
" <td>Gramicidin D</td>\n",
|
| 137 |
+
" <td>liquid</td>\n",
|
| 138 |
+
" <td>R02AB</td>\n",
|
| 139 |
+
" <td>R02A</td>\n",
|
| 140 |
+
" <td>R02</td>\n",
|
| 141 |
+
" <td>R</td>\n",
|
| 142 |
+
" <td>1811.2530</td>\n",
|
| 143 |
+
" <td>5.96</td>\n",
|
| 144 |
+
" <td>0.00390</td>\n",
|
| 145 |
+
" <td>-5.7</td>\n",
|
| 146 |
+
" <td>...</td>\n",
|
| 147 |
+
" <td>50.0</td>\n",
|
| 148 |
+
" <td>519.89</td>\n",
|
| 149 |
+
" <td>NaN</td>\n",
|
| 150 |
+
" <td>0.0</td>\n",
|
| 151 |
+
" <td>1810.033419</td>\n",
|
| 152 |
+
" <td>1.0</td>\n",
|
| 153 |
+
" <td>194.73</td>\n",
|
| 154 |
+
" <td>16.0</td>\n",
|
| 155 |
+
" <td>0.0</td>\n",
|
| 156 |
+
" <td>0.0</td>\n",
|
| 157 |
+
" </tr>\n",
|
| 158 |
+
" <tr>\n",
|
| 159 |
+
" <th>33</th>\n",
|
| 160 |
+
" <td>Desmopressin</td>\n",
|
| 161 |
+
" <td>solid</td>\n",
|
| 162 |
+
" <td>H01BA</td>\n",
|
| 163 |
+
" <td>H01B</td>\n",
|
| 164 |
+
" <td>H01</td>\n",
|
| 165 |
+
" <td>H</td>\n",
|
| 166 |
+
" <td>1069.2200</td>\n",
|
| 167 |
+
" <td>-6.10</td>\n",
|
| 168 |
+
" <td>0.11000</td>\n",
|
| 169 |
+
" <td>-4.0</td>\n",
|
| 170 |
+
" <td>...</td>\n",
|
| 171 |
+
" <td>19.0</td>\n",
|
| 172 |
+
" <td>435.41</td>\n",
|
| 173 |
+
" <td>11.77</td>\n",
|
| 174 |
+
" <td>0.0</td>\n",
|
| 175 |
+
" <td>1068.426956</td>\n",
|
| 176 |
+
" <td>1.0</td>\n",
|
| 177 |
+
" <td>104.78</td>\n",
|
| 178 |
+
" <td>15.0</td>\n",
|
| 179 |
+
" <td>1.0</td>\n",
|
| 180 |
+
" <td>0.0</td>\n",
|
| 181 |
+
" </tr>\n",
|
| 182 |
+
" <tr>\n",
|
| 183 |
+
" <th>47</th>\n",
|
| 184 |
+
" <td>Cetrorelix</td>\n",
|
| 185 |
+
" <td>solid</td>\n",
|
| 186 |
+
" <td>H01CC</td>\n",
|
| 187 |
+
" <td>H01C</td>\n",
|
| 188 |
+
" <td>H01</td>\n",
|
| 189 |
+
" <td>H</td>\n",
|
| 190 |
+
" <td>1431.0380</td>\n",
|
| 191 |
+
" <td>-1.70</td>\n",
|
| 192 |
+
" <td>0.00694</td>\n",
|
| 193 |
+
" <td>-5.3</td>\n",
|
| 194 |
+
" <td>...</td>\n",
|
| 195 |
+
" <td>38.0</td>\n",
|
| 196 |
+
" <td>495.67</td>\n",
|
| 197 |
+
" <td>11.79</td>\n",
|
| 198 |
+
" <td>0.0</td>\n",
|
| 199 |
+
" <td>1429.669818</td>\n",
|
| 200 |
+
" <td>1.0</td>\n",
|
| 201 |
+
" <td>148.93</td>\n",
|
| 202 |
+
" <td>18.0</td>\n",
|
| 203 |
+
" <td>1.0</td>\n",
|
| 204 |
+
" <td>0.0</td>\n",
|
| 205 |
+
" </tr>\n",
|
| 206 |
+
" <tr>\n",
|
| 207 |
+
" <th>74</th>\n",
|
| 208 |
+
" <td>Daptomycin</td>\n",
|
| 209 |
+
" <td>solid</td>\n",
|
| 210 |
+
" <td>J01XX</td>\n",
|
| 211 |
+
" <td>J01X</td>\n",
|
| 212 |
+
" <td>J01</td>\n",
|
| 213 |
+
" <td>J</td>\n",
|
| 214 |
+
" <td>1620.6930</td>\n",
|
| 215 |
+
" <td>-9.40</td>\n",
|
| 216 |
+
" <td>0.01730</td>\n",
|
| 217 |
+
" <td>-5.0</td>\n",
|
| 218 |
+
" <td>...</td>\n",
|
| 219 |
+
" <td>35.0</td>\n",
|
| 220 |
+
" <td>702.02</td>\n",
|
| 221 |
+
" <td>9.59</td>\n",
|
| 222 |
+
" <td>0.0</td>\n",
|
| 223 |
+
" <td>1619.710366</td>\n",
|
| 224 |
+
" <td>1.0</td>\n",
|
| 225 |
+
" <td>158.96</td>\n",
|
| 226 |
+
" <td>27.0</td>\n",
|
| 227 |
+
" <td>-3.0</td>\n",
|
| 228 |
+
" <td>0.0</td>\n",
|
| 229 |
+
" </tr>\n",
|
| 230 |
+
" <tr>\n",
|
| 231 |
+
" <th>97</th>\n",
|
| 232 |
+
" <td>Abarelix</td>\n",
|
| 233 |
+
" <td>solid</td>\n",
|
| 234 |
+
" <td>L02BX</td>\n",
|
| 235 |
+
" <td>L02B</td>\n",
|
| 236 |
+
" <td>L02</td>\n",
|
| 237 |
+
" <td>L</td>\n",
|
| 238 |
+
" <td>1416.0900</td>\n",
|
| 239 |
+
" <td>-0.46</td>\n",
|
| 240 |
+
" <td>0.00371</td>\n",
|
| 241 |
+
" <td>-5.6</td>\n",
|
| 242 |
+
" <td>...</td>\n",
|
| 243 |
+
" <td>38.0</td>\n",
|
| 244 |
+
" <td>424.98</td>\n",
|
| 245 |
+
" <td>10.66</td>\n",
|
| 246 |
+
" <td>0.0</td>\n",
|
| 247 |
+
" <td>1414.684072</td>\n",
|
| 248 |
+
" <td>1.0</td>\n",
|
| 249 |
+
" <td>149.31</td>\n",
|
| 250 |
+
" <td>16.0</td>\n",
|
| 251 |
+
" <td>1.0</td>\n",
|
| 252 |
+
" <td>0.0</td>\n",
|
| 253 |
+
" </tr>\n",
|
| 254 |
+
" <tr>\n",
|
| 255 |
+
" <th>105</th>\n",
|
| 256 |
+
" <td>Pyridoxal phosphate</td>\n",
|
| 257 |
+
" <td>solid</td>\n",
|
| 258 |
+
" <td>A11HA</td>\n",
|
| 259 |
+
" <td>A11H</td>\n",
|
| 260 |
+
" <td>A11</td>\n",
|
| 261 |
+
" <td>A</td>\n",
|
| 262 |
+
" <td>247.1419</td>\n",
|
| 263 |
+
" <td>-2.10</td>\n",
|
| 264 |
+
" <td>5.70000</td>\n",
|
| 265 |
+
" <td>-1.6</td>\n",
|
| 266 |
+
" <td>...</td>\n",
|
| 267 |
+
" <td>4.0</td>\n",
|
| 268 |
+
" <td>116.95</td>\n",
|
| 269 |
+
" <td>4.11</td>\n",
|
| 270 |
+
" <td>0.0</td>\n",
|
| 271 |
+
" <td>247.024574</td>\n",
|
| 272 |
+
" <td>0.0</td>\n",
|
| 273 |
+
" <td>20.90</td>\n",
|
| 274 |
+
" <td>6.0</td>\n",
|
| 275 |
+
" <td>-2.0</td>\n",
|
| 276 |
+
" <td>1.0</td>\n",
|
| 277 |
+
" </tr>\n",
|
| 278 |
+
" <tr>\n",
|
| 279 |
+
" <th>106</th>\n",
|
| 280 |
+
" <td>Cyanocobalamin</td>\n",
|
| 281 |
+
" <td>solid</td>\n",
|
| 282 |
+
" <td>B03BA</td>\n",
|
| 283 |
+
" <td>B03B</td>\n",
|
| 284 |
+
" <td>B03</td>\n",
|
| 285 |
+
" <td>B</td>\n",
|
| 286 |
+
" <td>1355.3652</td>\n",
|
| 287 |
+
" <td>-3.20</td>\n",
|
| 288 |
+
" <td>0.02020</td>\n",
|
| 289 |
+
" <td>-4.8</td>\n",
|
| 290 |
+
" <td>...</td>\n",
|
| 291 |
+
" <td>27.0</td>\n",
|
| 292 |
+
" <td>477.85</td>\n",
|
| 293 |
+
" <td>8.68</td>\n",
|
| 294 |
+
" <td>0.0</td>\n",
|
| 295 |
+
" <td>1354.567405</td>\n",
|
| 296 |
+
" <td>1.0</td>\n",
|
| 297 |
+
" <td>138.79</td>\n",
|
| 298 |
+
" <td>18.0</td>\n",
|
| 299 |
+
" <td>3.0</td>\n",
|
| 300 |
+
" <td>0.0</td>\n",
|
| 301 |
+
" </tr>\n",
|
| 302 |
+
" </tbody>\n",
|
| 303 |
+
"</table>\n",
|
| 304 |
+
"<p>10 rows × 25 columns</p>\n",
|
| 305 |
+
"</div>"
|
| 306 |
+
],
|
| 307 |
+
"text/plain": [
|
| 308 |
+
" name state level4 level3 level2 level1 \\\n",
|
| 309 |
+
"5 Bivalirudin solid B01AE B01A B01 B \n",
|
| 310 |
+
"6 Leuprolide solid L02AE L02A L02 L \n",
|
| 311 |
+
"13 Goserelin solid L02AE L02A L02 L \n",
|
| 312 |
+
"25 Gramicidin D liquid R02AB R02A R02 R \n",
|
| 313 |
+
"33 Desmopressin solid H01BA H01B H01 H \n",
|
| 314 |
+
"47 Cetrorelix solid H01CC H01C H01 H \n",
|
| 315 |
+
"74 Daptomycin solid J01XX J01X J01 J \n",
|
| 316 |
+
"97 Abarelix solid L02BX L02B L02 L \n",
|
| 317 |
+
"105 Pyridoxal phosphate solid A11HA A11H A11 A \n",
|
| 318 |
+
"106 Cyanocobalamin solid B03BA B03B B03 B \n",
|
| 319 |
+
"\n",
|
| 320 |
+
" Molecular Weight logP Water Solubility logS ... \\\n",
|
| 321 |
+
"5 2180.2853 -14.00 0.04640 -4.7 ... \n",
|
| 322 |
+
"6 1209.3983 -2.40 0.03380 -4.6 ... \n",
|
| 323 |
+
"13 1269.4105 -5.10 0.02830 -4.6 ... \n",
|
| 324 |
+
"25 1811.2530 5.96 0.00390 -5.7 ... \n",
|
| 325 |
+
"33 1069.2200 -6.10 0.11000 -4.0 ... \n",
|
| 326 |
+
"47 1431.0380 -1.70 0.00694 -5.3 ... \n",
|
| 327 |
+
"74 1620.6930 -9.40 0.01730 -5.0 ... \n",
|
| 328 |
+
"97 1416.0900 -0.46 0.00371 -5.6 ... \n",
|
| 329 |
+
"105 247.1419 -2.10 5.70000 -1.6 ... \n",
|
| 330 |
+
"106 1355.3652 -3.20 0.02020 -4.8 ... \n",
|
| 331 |
+
"\n",
|
| 332 |
+
" Rotatable Bond Count Polar Surface Area (PSA) pKa (strongest basic) \\\n",
|
| 333 |
+
"5 66.0 901.57 11.88 \n",
|
| 334 |
+
"6 32.0 429.04 11.92 \n",
|
| 335 |
+
"13 33.0 495.89 10.91 \n",
|
| 336 |
+
"25 50.0 519.89 NaN \n",
|
| 337 |
+
"33 19.0 435.41 11.77 \n",
|
| 338 |
+
"47 38.0 495.67 11.79 \n",
|
| 339 |
+
"74 35.0 702.02 9.59 \n",
|
| 340 |
+
"97 38.0 424.98 10.66 \n",
|
| 341 |
+
"105 4.0 116.95 4.11 \n",
|
| 342 |
+
"106 27.0 477.85 8.68 \n",
|
| 343 |
+
"\n",
|
| 344 |
+
" Ghose Filter Monoisotopic Weight MDDR-Like Rule Polarizability \\\n",
|
| 345 |
+
"5 0.0 2178.985813 1.0 218.54 \n",
|
| 346 |
+
"6 0.0 1208.645462 1.0 125.24 \n",
|
| 347 |
+
"13 0.0 1268.641439 1.0 130.74 \n",
|
| 348 |
+
"25 0.0 1810.033419 1.0 194.73 \n",
|
| 349 |
+
"33 0.0 1068.426956 1.0 104.78 \n",
|
| 350 |
+
"47 0.0 1429.669818 1.0 148.93 \n",
|
| 351 |
+
"74 0.0 1619.710366 1.0 158.96 \n",
|
| 352 |
+
"97 0.0 1414.684072 1.0 149.31 \n",
|
| 353 |
+
"105 0.0 247.024574 0.0 20.90 \n",
|
| 354 |
+
"106 0.0 1354.567405 1.0 138.79 \n",
|
| 355 |
+
"\n",
|
| 356 |
+
" H Bond Acceptor Count Physiological Charge Rule of Five \n",
|
| 357 |
+
"5 37.0 -4.0 0.0 \n",
|
| 358 |
+
"6 16.0 1.0 0.0 \n",
|
| 359 |
+
"13 18.0 1.0 0.0 \n",
|
| 360 |
+
"25 16.0 0.0 0.0 \n",
|
| 361 |
+
"33 15.0 1.0 0.0 \n",
|
| 362 |
+
"47 18.0 1.0 0.0 \n",
|
| 363 |
+
"74 27.0 -3.0 0.0 \n",
|
| 364 |
+
"97 16.0 1.0 0.0 \n",
|
| 365 |
+
"105 6.0 -2.0 1.0 \n",
|
| 366 |
+
"106 18.0 3.0 0.0 \n",
|
| 367 |
+
"\n",
|
| 368 |
+
"[10 rows x 25 columns]"
|
| 369 |
+
]
|
| 370 |
+
},
|
| 371 |
+
"execution_count": 7,
|
| 372 |
+
"metadata": {},
|
| 373 |
+
"output_type": "execute_result"
|
| 374 |
+
}
|
| 375 |
+
],
|
| 376 |
+
"source": [
|
| 377 |
+
"# drop the first column\n",
|
| 378 |
+
"df = pd.read_csv('datasets/filtered_dataset.csv', index_col=0)\n",
|
| 379 |
+
"df.head(10)"
|
| 380 |
+
]
|
| 381 |
+
},
|
| 382 |
+
{
|
| 383 |
+
"cell_type": "code",
|
| 384 |
+
"execution_count": 8,
|
| 385 |
+
"metadata": {},
|
| 386 |
+
"outputs": [
|
| 387 |
+
{
|
| 388 |
+
"data": {
|
| 389 |
+
"text/html": [
|
| 390 |
+
"<div>\n",
|
| 391 |
+
"<style scoped>\n",
|
| 392 |
+
" .dataframe tbody tr th:only-of-type {\n",
|
| 393 |
+
" vertical-align: middle;\n",
|
| 394 |
+
" }\n",
|
| 395 |
+
"\n",
|
| 396 |
+
" .dataframe tbody tr th {\n",
|
| 397 |
+
" vertical-align: top;\n",
|
| 398 |
+
" }\n",
|
| 399 |
+
"\n",
|
| 400 |
+
" .dataframe thead th {\n",
|
| 401 |
+
" text-align: right;\n",
|
| 402 |
+
" }\n",
|
| 403 |
+
"</style>\n",
|
| 404 |
+
"<table border=\"1\" class=\"dataframe\">\n",
|
| 405 |
+
" <thead>\n",
|
| 406 |
+
" <tr style=\"text-align: right;\">\n",
|
| 407 |
+
" <th></th>\n",
|
| 408 |
+
" <th>name_x</th>\n",
|
| 409 |
+
" <th>state_x</th>\n",
|
| 410 |
+
" <th>level4_x</th>\n",
|
| 411 |
+
" <th>level3_x</th>\n",
|
| 412 |
+
" <th>level2_x</th>\n",
|
| 413 |
+
" <th>level1_x</th>\n",
|
| 414 |
+
" <th>Molecular Weight_x</th>\n",
|
| 415 |
+
" <th>logP_x</th>\n",
|
| 416 |
+
" <th>Water Solubility_x</th>\n",
|
| 417 |
+
" <th>logS_x</th>\n",
|
| 418 |
+
" <th>...</th>\n",
|
| 419 |
+
" <th>Rotatable Bond Count_y</th>\n",
|
| 420 |
+
" <th>Polar Surface Area (PSA)_y</th>\n",
|
| 421 |
+
" <th>pKa (strongest basic)_y</th>\n",
|
| 422 |
+
" <th>Ghose Filter_y</th>\n",
|
| 423 |
+
" <th>Monoisotopic Weight_y</th>\n",
|
| 424 |
+
" <th>MDDR-Like Rule_y</th>\n",
|
| 425 |
+
" <th>Polarizability_y</th>\n",
|
| 426 |
+
" <th>H Bond Acceptor Count_y</th>\n",
|
| 427 |
+
" <th>Physiological Charge_y</th>\n",
|
| 428 |
+
" <th>Rule of Five_y</th>\n",
|
| 429 |
+
" </tr>\n",
|
| 430 |
+
" </thead>\n",
|
| 431 |
+
" <tbody>\n",
|
| 432 |
+
" <tr>\n",
|
| 433 |
+
" <th>0</th>\n",
|
| 434 |
+
" <td>Bivalirudin</td>\n",
|
| 435 |
+
" <td>solid</td>\n",
|
| 436 |
+
" <td>B01AE</td>\n",
|
| 437 |
+
" <td>B01A</td>\n",
|
| 438 |
+
" <td>B01</td>\n",
|
| 439 |
+
" <td>B</td>\n",
|
| 440 |
+
" <td>2180.2853</td>\n",
|
| 441 |
+
" <td>-14.0</td>\n",
|
| 442 |
+
" <td>0.0464</td>\n",
|
| 443 |
+
" <td>-4.7</td>\n",
|
| 444 |
+
" <td>...</td>\n",
|
| 445 |
+
" <td>66.0</td>\n",
|
| 446 |
+
" <td>901.57</td>\n",
|
| 447 |
+
" <td>11.88</td>\n",
|
| 448 |
+
" <td>0.0</td>\n",
|
| 449 |
+
" <td>2178.985813</td>\n",
|
| 450 |
+
" <td>1.0</td>\n",
|
| 451 |
+
" <td>218.54</td>\n",
|
| 452 |
+
" <td>37.0</td>\n",
|
| 453 |
+
" <td>-4.0</td>\n",
|
| 454 |
+
" <td>0.0</td>\n",
|
| 455 |
+
" </tr>\n",
|
| 456 |
+
" <tr>\n",
|
| 457 |
+
" <th>1</th>\n",
|
| 458 |
+
" <td>Bivalirudin</td>\n",
|
| 459 |
+
" <td>solid</td>\n",
|
| 460 |
+
" <td>B01AE</td>\n",
|
| 461 |
+
" <td>B01A</td>\n",
|
| 462 |
+
" <td>B01</td>\n",
|
| 463 |
+
" <td>B</td>\n",
|
| 464 |
+
" <td>2180.2853</td>\n",
|
| 465 |
+
" <td>-14.0</td>\n",
|
| 466 |
+
" <td>0.0464</td>\n",
|
| 467 |
+
" <td>-4.7</td>\n",
|
| 468 |
+
" <td>...</td>\n",
|
| 469 |
+
" <td>32.0</td>\n",
|
| 470 |
+
" <td>429.04</td>\n",
|
| 471 |
+
" <td>11.92</td>\n",
|
| 472 |
+
" <td>0.0</td>\n",
|
| 473 |
+
" <td>1208.645462</td>\n",
|
| 474 |
+
" <td>1.0</td>\n",
|
| 475 |
+
" <td>125.24</td>\n",
|
| 476 |
+
" <td>16.0</td>\n",
|
| 477 |
+
" <td>1.0</td>\n",
|
| 478 |
+
" <td>0.0</td>\n",
|
| 479 |
+
" </tr>\n",
|
| 480 |
+
" <tr>\n",
|
| 481 |
+
" <th>2</th>\n",
|
| 482 |
+
" <td>Bivalirudin</td>\n",
|
| 483 |
+
" <td>solid</td>\n",
|
| 484 |
+
" <td>B01AE</td>\n",
|
| 485 |
+
" <td>B01A</td>\n",
|
| 486 |
+
" <td>B01</td>\n",
|
| 487 |
+
" <td>B</td>\n",
|
| 488 |
+
" <td>2180.2853</td>\n",
|
| 489 |
+
" <td>-14.0</td>\n",
|
| 490 |
+
" <td>0.0464</td>\n",
|
| 491 |
+
" <td>-4.7</td>\n",
|
| 492 |
+
" <td>...</td>\n",
|
| 493 |
+
" <td>33.0</td>\n",
|
| 494 |
+
" <td>495.89</td>\n",
|
| 495 |
+
" <td>10.91</td>\n",
|
| 496 |
+
" <td>0.0</td>\n",
|
| 497 |
+
" <td>1268.641439</td>\n",
|
| 498 |
+
" <td>1.0</td>\n",
|
| 499 |
+
" <td>130.74</td>\n",
|
| 500 |
+
" <td>18.0</td>\n",
|
| 501 |
+
" <td>1.0</td>\n",
|
| 502 |
+
" <td>0.0</td>\n",
|
| 503 |
+
" </tr>\n",
|
| 504 |
+
" <tr>\n",
|
| 505 |
+
" <th>3</th>\n",
|
| 506 |
+
" <td>Bivalirudin</td>\n",
|
| 507 |
+
" <td>solid</td>\n",
|
| 508 |
+
" <td>B01AE</td>\n",
|
| 509 |
+
" <td>B01A</td>\n",
|
| 510 |
+
" <td>B01</td>\n",
|
| 511 |
+
" <td>B</td>\n",
|
| 512 |
+
" <td>2180.2853</td>\n",
|
| 513 |
+
" <td>-14.0</td>\n",
|
| 514 |
+
" <td>0.0464</td>\n",
|
| 515 |
+
" <td>-4.7</td>\n",
|
| 516 |
+
" <td>...</td>\n",
|
| 517 |
+
" <td>50.0</td>\n",
|
| 518 |
+
" <td>519.89</td>\n",
|
| 519 |
+
" <td>NaN</td>\n",
|
| 520 |
+
" <td>0.0</td>\n",
|
| 521 |
+
" <td>1810.033419</td>\n",
|
| 522 |
+
" <td>1.0</td>\n",
|
| 523 |
+
" <td>194.73</td>\n",
|
| 524 |
+
" <td>16.0</td>\n",
|
| 525 |
+
" <td>0.0</td>\n",
|
| 526 |
+
" <td>0.0</td>\n",
|
| 527 |
+
" </tr>\n",
|
| 528 |
+
" <tr>\n",
|
| 529 |
+
" <th>4</th>\n",
|
| 530 |
+
" <td>Bivalirudin</td>\n",
|
| 531 |
+
" <td>solid</td>\n",
|
| 532 |
+
" <td>B01AE</td>\n",
|
| 533 |
+
" <td>B01A</td>\n",
|
| 534 |
+
" <td>B01</td>\n",
|
| 535 |
+
" <td>B</td>\n",
|
| 536 |
+
" <td>2180.2853</td>\n",
|
| 537 |
+
" <td>-14.0</td>\n",
|
| 538 |
+
" <td>0.0464</td>\n",
|
| 539 |
+
" <td>-4.7</td>\n",
|
| 540 |
+
" <td>...</td>\n",
|
| 541 |
+
" <td>19.0</td>\n",
|
| 542 |
+
" <td>435.41</td>\n",
|
| 543 |
+
" <td>11.77</td>\n",
|
| 544 |
+
" <td>0.0</td>\n",
|
| 545 |
+
" <td>1068.426956</td>\n",
|
| 546 |
+
" <td>1.0</td>\n",
|
| 547 |
+
" <td>104.78</td>\n",
|
| 548 |
+
" <td>15.0</td>\n",
|
| 549 |
+
" <td>1.0</td>\n",
|
| 550 |
+
" <td>0.0</td>\n",
|
| 551 |
+
" </tr>\n",
|
| 552 |
+
" <tr>\n",
|
| 553 |
+
" <th>...</th>\n",
|
| 554 |
+
" <td>...</td>\n",
|
| 555 |
+
" <td>...</td>\n",
|
| 556 |
+
" <td>...</td>\n",
|
| 557 |
+
" <td>...</td>\n",
|
| 558 |
+
" <td>...</td>\n",
|
| 559 |
+
" <td>...</td>\n",
|
| 560 |
+
" <td>...</td>\n",
|
| 561 |
+
" <td>...</td>\n",
|
| 562 |
+
" <td>...</td>\n",
|
| 563 |
+
" <td>...</td>\n",
|
| 564 |
+
" <td>...</td>\n",
|
| 565 |
+
" <td>...</td>\n",
|
| 566 |
+
" <td>...</td>\n",
|
| 567 |
+
" <td>...</td>\n",
|
| 568 |
+
" <td>...</td>\n",
|
| 569 |
+
" <td>...</td>\n",
|
| 570 |
+
" <td>...</td>\n",
|
| 571 |
+
" <td>...</td>\n",
|
| 572 |
+
" <td>...</td>\n",
|
| 573 |
+
" <td>...</td>\n",
|
| 574 |
+
" <td>...</td>\n",
|
| 575 |
+
" </tr>\n",
|
| 576 |
+
" <tr>\n",
|
| 577 |
+
" <th>6916895</th>\n",
|
| 578 |
+
" <td>Methionine C-11</td>\n",
|
| 579 |
+
" <td>NaN</td>\n",
|
| 580 |
+
" <td>V09IX</td>\n",
|
| 581 |
+
" <td>V09I</td>\n",
|
| 582 |
+
" <td>V09</td>\n",
|
| 583 |
+
" <td>V</td>\n",
|
| 584 |
+
" <td>148.2100</td>\n",
|
| 585 |
+
" <td>-2.2</td>\n",
|
| 586 |
+
" <td>23.9000</td>\n",
|
| 587 |
+
" <td>-0.8</td>\n",
|
| 588 |
+
" <td>...</td>\n",
|
| 589 |
+
" <td>7.0</td>\n",
|
| 590 |
+
" <td>104.82</td>\n",
|
| 591 |
+
" <td>4.11</td>\n",
|
| 592 |
+
" <td>0.0</td>\n",
|
| 593 |
+
" <td>452.196074</td>\n",
|
| 594 |
+
" <td>1.0</td>\n",
|
| 595 |
+
" <td>49.55</td>\n",
|
| 596 |
+
" <td>6.0</td>\n",
|
| 597 |
+
" <td>0.0</td>\n",
|
| 598 |
+
" <td>1.0</td>\n",
|
| 599 |
+
" </tr>\n",
|
| 600 |
+
" <tr>\n",
|
| 601 |
+
" <th>6916896</th>\n",
|
| 602 |
+
" <td>Methionine C-11</td>\n",
|
| 603 |
+
" <td>NaN</td>\n",
|
| 604 |
+
" <td>V09IX</td>\n",
|
| 605 |
+
" <td>V09I</td>\n",
|
| 606 |
+
" <td>V09</td>\n",
|
| 607 |
+
" <td>V</td>\n",
|
| 608 |
+
" <td>148.2100</td>\n",
|
| 609 |
+
" <td>-2.2</td>\n",
|
| 610 |
+
" <td>23.9000</td>\n",
|
| 611 |
+
" <td>-0.8</td>\n",
|
| 612 |
+
" <td>...</td>\n",
|
| 613 |
+
" <td>9.0</td>\n",
|
| 614 |
+
" <td>108.74</td>\n",
|
| 615 |
+
" <td>6.27</td>\n",
|
| 616 |
+
" <td>0.0</td>\n",
|
| 617 |
+
" <td>497.165428</td>\n",
|
| 618 |
+
" <td>1.0</td>\n",
|
| 619 |
+
" <td>53.39</td>\n",
|
| 620 |
+
" <td>6.0</td>\n",
|
| 621 |
+
" <td>0.0</td>\n",
|
| 622 |
+
" <td>1.0</td>\n",
|
| 623 |
+
" </tr>\n",
|
| 624 |
+
" <tr>\n",
|
| 625 |
+
" <th>6916897</th>\n",
|
| 626 |
+
" <td>Methionine C-11</td>\n",
|
| 627 |
+
" <td>NaN</td>\n",
|
| 628 |
+
" <td>V09IX</td>\n",
|
| 629 |
+
" <td>V09I</td>\n",
|
| 630 |
+
" <td>V09</td>\n",
|
| 631 |
+
" <td>V</td>\n",
|
| 632 |
+
" <td>148.2100</td>\n",
|
| 633 |
+
" <td>-2.2</td>\n",
|
| 634 |
+
" <td>23.9000</td>\n",
|
| 635 |
+
" <td>-0.8</td>\n",
|
| 636 |
+
" <td>...</td>\n",
|
| 637 |
+
" <td>3.0</td>\n",
|
| 638 |
+
" <td>99.76</td>\n",
|
| 639 |
+
" <td>9.80</td>\n",
|
| 640 |
+
" <td>1.0</td>\n",
|
| 641 |
+
" <td>404.109625</td>\n",
|
| 642 |
+
" <td>0.0</td>\n",
|
| 643 |
+
" <td>37.18</td>\n",
|
| 644 |
+
" <td>7.0</td>\n",
|
| 645 |
+
" <td>0.0</td>\n",
|
| 646 |
+
" <td>1.0</td>\n",
|
| 647 |
+
" </tr>\n",
|
| 648 |
+
" <tr>\n",
|
| 649 |
+
" <th>6916898</th>\n",
|
| 650 |
+
" <td>Methionine C-11</td>\n",
|
| 651 |
+
" <td>NaN</td>\n",
|
| 652 |
+
" <td>V09IX</td>\n",
|
| 653 |
+
" <td>V09I</td>\n",
|
| 654 |
+
" <td>V09</td>\n",
|
| 655 |
+
" <td>V</td>\n",
|
| 656 |
+
" <td>148.2100</td>\n",
|
| 657 |
+
" <td>-2.2</td>\n",
|
| 658 |
+
" <td>23.9000</td>\n",
|
| 659 |
+
" <td>-0.8</td>\n",
|
| 660 |
+
" <td>...</td>\n",
|
| 661 |
+
" <td>6.0</td>\n",
|
| 662 |
+
" <td>114.40</td>\n",
|
| 663 |
+
" <td>-3.50</td>\n",
|
| 664 |
+
" <td>0.0</td>\n",
|
| 665 |
+
" <td>508.055206</td>\n",
|
| 666 |
+
" <td>1.0</td>\n",
|
| 667 |
+
" <td>45.39</td>\n",
|
| 668 |
+
" <td>7.0</td>\n",
|
| 669 |
+
" <td>-1.0</td>\n",
|
| 670 |
+
" <td>0.0</td>\n",
|
| 671 |
+
" </tr>\n",
|
| 672 |
+
" <tr>\n",
|
| 673 |
+
" <th>6916899</th>\n",
|
| 674 |
+
" <td>Methionine C-11</td>\n",
|
| 675 |
+
" <td>NaN</td>\n",
|
| 676 |
+
" <td>V09IX</td>\n",
|
| 677 |
+
" <td>V09I</td>\n",
|
| 678 |
+
" <td>V09</td>\n",
|
| 679 |
+
" <td>V</td>\n",
|
| 680 |
+
" <td>148.2100</td>\n",
|
| 681 |
+
" <td>-2.2</td>\n",
|
| 682 |
+
" <td>23.9000</td>\n",
|
| 683 |
+
" <td>-0.8</td>\n",
|
| 684 |
+
" <td>...</td>\n",
|
| 685 |
+
" <td>4.0</td>\n",
|
| 686 |
+
" <td>63.32</td>\n",
|
| 687 |
+
" <td>9.50</td>\n",
|
| 688 |
+
" <td>0.0</td>\n",
|
| 689 |
+
" <td>148.062484</td>\n",
|
| 690 |
+
" <td>0.0</td>\n",
|
| 691 |
+
" <td>15.54</td>\n",
|
| 692 |
+
" <td>3.0</td>\n",
|
| 693 |
+
" <td>0.0</td>\n",
|
| 694 |
+
" <td>1.0</td>\n",
|
| 695 |
+
" </tr>\n",
|
| 696 |
+
" </tbody>\n",
|
| 697 |
+
"</table>\n",
|
| 698 |
+
"<p>6916900 rows × 50 columns</p>\n",
|
| 699 |
+
"</div>"
|
| 700 |
+
],
|
| 701 |
+
"text/plain": [
|
| 702 |
+
" name_x state_x level4_x level3_x level2_x level1_x \\\n",
|
| 703 |
+
"0 Bivalirudin solid B01AE B01A B01 B \n",
|
| 704 |
+
"1 Bivalirudin solid B01AE B01A B01 B \n",
|
| 705 |
+
"2 Bivalirudin solid B01AE B01A B01 B \n",
|
| 706 |
+
"3 Bivalirudin solid B01AE B01A B01 B \n",
|
| 707 |
+
"4 Bivalirudin solid B01AE B01A B01 B \n",
|
| 708 |
+
"... ... ... ... ... ... ... \n",
|
| 709 |
+
"6916895 Methionine C-11 NaN V09IX V09I V09 V \n",
|
| 710 |
+
"6916896 Methionine C-11 NaN V09IX V09I V09 V \n",
|
| 711 |
+
"6916897 Methionine C-11 NaN V09IX V09I V09 V \n",
|
| 712 |
+
"6916898 Methionine C-11 NaN V09IX V09I V09 V \n",
|
| 713 |
+
"6916899 Methionine C-11 NaN V09IX V09I V09 V \n",
|
| 714 |
+
"\n",
|
| 715 |
+
" Molecular Weight_x logP_x Water Solubility_x logS_x ... \\\n",
|
| 716 |
+
"0 2180.2853 -14.0 0.0464 -4.7 ... \n",
|
| 717 |
+
"1 2180.2853 -14.0 0.0464 -4.7 ... \n",
|
| 718 |
+
"2 2180.2853 -14.0 0.0464 -4.7 ... \n",
|
| 719 |
+
"3 2180.2853 -14.0 0.0464 -4.7 ... \n",
|
| 720 |
+
"4 2180.2853 -14.0 0.0464 -4.7 ... \n",
|
| 721 |
+
"... ... ... ... ... ... \n",
|
| 722 |
+
"6916895 148.2100 -2.2 23.9000 -0.8 ... \n",
|
| 723 |
+
"6916896 148.2100 -2.2 23.9000 -0.8 ... \n",
|
| 724 |
+
"6916897 148.2100 -2.2 23.9000 -0.8 ... \n",
|
| 725 |
+
"6916898 148.2100 -2.2 23.9000 -0.8 ... \n",
|
| 726 |
+
"6916899 148.2100 -2.2 23.9000 -0.8 ... \n",
|
| 727 |
+
"\n",
|
| 728 |
+
" Rotatable Bond Count_y Polar Surface Area (PSA)_y \\\n",
|
| 729 |
+
"0 66.0 901.57 \n",
|
| 730 |
+
"1 32.0 429.04 \n",
|
| 731 |
+
"2 33.0 495.89 \n",
|
| 732 |
+
"3 50.0 519.89 \n",
|
| 733 |
+
"4 19.0 435.41 \n",
|
| 734 |
+
"... ... ... \n",
|
| 735 |
+
"6916895 7.0 104.82 \n",
|
| 736 |
+
"6916896 9.0 108.74 \n",
|
| 737 |
+
"6916897 3.0 99.76 \n",
|
| 738 |
+
"6916898 6.0 114.40 \n",
|
| 739 |
+
"6916899 4.0 63.32 \n",
|
| 740 |
+
"\n",
|
| 741 |
+
" pKa (strongest basic)_y Ghose Filter_y Monoisotopic Weight_y \\\n",
|
| 742 |
+
"0 11.88 0.0 2178.985813 \n",
|
| 743 |
+
"1 11.92 0.0 1208.645462 \n",
|
| 744 |
+
"2 10.91 0.0 1268.641439 \n",
|
| 745 |
+
"3 NaN 0.0 1810.033419 \n",
|
| 746 |
+
"4 11.77 0.0 1068.426956 \n",
|
| 747 |
+
"... ... ... ... \n",
|
| 748 |
+
"6916895 4.11 0.0 452.196074 \n",
|
| 749 |
+
"6916896 6.27 0.0 497.165428 \n",
|
| 750 |
+
"6916897 9.80 1.0 404.109625 \n",
|
| 751 |
+
"6916898 -3.50 0.0 508.055206 \n",
|
| 752 |
+
"6916899 9.50 0.0 148.062484 \n",
|
| 753 |
+
"\n",
|
| 754 |
+
" MDDR-Like Rule_y Polarizability_y H Bond Acceptor Count_y \\\n",
|
| 755 |
+
"0 1.0 218.54 37.0 \n",
|
| 756 |
+
"1 1.0 125.24 16.0 \n",
|
| 757 |
+
"2 1.0 130.74 18.0 \n",
|
| 758 |
+
"3 1.0 194.73 16.0 \n",
|
| 759 |
+
"4 1.0 104.78 15.0 \n",
|
| 760 |
+
"... ... ... ... \n",
|
| 761 |
+
"6916895 1.0 49.55 6.0 \n",
|
| 762 |
+
"6916896 1.0 53.39 6.0 \n",
|
| 763 |
+
"6916897 0.0 37.18 7.0 \n",
|
| 764 |
+
"6916898 1.0 45.39 7.0 \n",
|
| 765 |
+
"6916899 0.0 15.54 3.0 \n",
|
| 766 |
+
"\n",
|
| 767 |
+
" Physiological Charge_y Rule of Five_y \n",
|
| 768 |
+
"0 -4.0 0.0 \n",
|
| 769 |
+
"1 1.0 0.0 \n",
|
| 770 |
+
"2 1.0 0.0 \n",
|
| 771 |
+
"3 0.0 0.0 \n",
|
| 772 |
+
"4 1.0 0.0 \n",
|
| 773 |
+
"... ... ... \n",
|
| 774 |
+
"6916895 0.0 1.0 \n",
|
| 775 |
+
"6916896 0.0 1.0 \n",
|
| 776 |
+
"6916897 0.0 1.0 \n",
|
| 777 |
+
"6916898 -1.0 0.0 \n",
|
| 778 |
+
"6916899 0.0 1.0 \n",
|
| 779 |
+
"\n",
|
| 780 |
+
"[6916900 rows x 50 columns]"
|
| 781 |
+
]
|
| 782 |
+
},
|
| 783 |
+
"execution_count": 8,
|
| 784 |
+
"metadata": {},
|
| 785 |
+
"output_type": "execute_result"
|
| 786 |
+
}
|
| 787 |
+
],
|
| 788 |
+
"source": [
|
| 789 |
+
"# cross two datasets to get all drug pairs\n",
|
| 790 |
+
"df1 = pd.read_csv('datasets/filtered_dataset.csv', index_col=0)\n",
|
| 791 |
+
"df2 = pd.read_csv('datasets/filtered_dataset.csv', index_col=0)\n",
|
| 792 |
+
"\n",
|
| 793 |
+
"df3 = pd.merge(df1, df2, how='cross')\n",
|
| 794 |
+
"df3"
|
| 795 |
+
]
|
| 796 |
+
},
|
| 797 |
+
{
|
| 798 |
+
"cell_type": "code",
|
| 799 |
+
"execution_count": 9,
|
| 800 |
+
"metadata": {},
|
| 801 |
+
"outputs": [],
|
| 802 |
+
"source": [
|
| 803 |
+
"df3.to_csv('datasets/drug_pairs.csv')"
|
| 804 |
+
]
|
| 805 |
+
},
|
| 806 |
+
{
|
| 807 |
+
"cell_type": "code",
|
| 808 |
+
"execution_count": 18,
|
| 809 |
+
"metadata": {},
|
| 810 |
+
"outputs": [
|
| 811 |
+
{
|
| 812 |
+
"name": "stdout",
|
| 813 |
+
"output_type": "stream",
|
| 814 |
+
"text": [
|
| 815 |
+
"(6916900, 50) (2630, 25)\n"
|
| 816 |
+
]
|
| 817 |
+
}
|
| 818 |
+
],
|
| 819 |
+
"source": [
|
| 820 |
+
"print(df3.shape, df1.shape)"
|
| 821 |
+
]
|
| 822 |
+
},
|
| 823 |
+
{
|
| 824 |
+
"cell_type": "code",
|
| 825 |
+
"execution_count": null,
|
| 826 |
+
"metadata": {},
|
| 827 |
+
"outputs": [],
|
| 828 |
+
"source": [
|
| 829 |
+
"from itertools import combinations\n",
|
| 830 |
+
"drug_pairs = list(combinations(df['name'], 2))\n",
|
| 831 |
+
"\n",
|
| 832 |
+
"# Create an empty dataframe to store the pairwise combinations and features\n",
|
| 833 |
+
"col1 = [x+\"_d1\" for x in df.columns[1:]]\n",
|
| 834 |
+
"col2 = [x+\"_d2\" for x in df.columns[1:]]\n",
|
| 835 |
+
"\n",
|
| 836 |
+
"df_pairs = pd.DataFrame(columns=['drug1', 'drug2', *col1, *col2])\n",
|
| 837 |
+
"\n",
|
| 838 |
+
"# Iterate through the drug pairs and populate the dataframe\n",
|
| 839 |
+
"for drug1, drug2 in drug_pairs:\n",
|
| 840 |
+
" features_drug1 = df[df['name'] == drug1][[*(df.columns[1:])]].values.flatten()\n",
|
| 841 |
+
" features_drug2 = df[df['name'] == drug2][[*(df.columns[1:])]].values.flatten()\n",
|
| 842 |
+
" row = pd.DataFrame([[drug1, drug2, *features_drug1, *features_drug2]], columns=df_pairs.columns)\n",
|
| 843 |
+
" df_pairs = df_pairs.append(row, ignore_index=True)\n",
|
| 844 |
+
"\n",
|
| 845 |
+
"# Print the resulting pairwise combinations and features dataframe\n",
|
| 846 |
+
"print(df_pairs)\n"
|
| 847 |
+
]
|
| 848 |
+
},
|
| 849 |
+
{
|
| 850 |
+
"cell_type": "code",
|
| 851 |
+
"execution_count": 1,
|
| 852 |
+
"metadata": {},
|
| 853 |
+
"outputs": [
|
| 854 |
+
{
|
| 855 |
+
"name": "stderr",
|
| 856 |
+
"output_type": "stream",
|
| 857 |
+
"text": [
|
| 858 |
+
"/home/bprimal/.local/lib/python3.8/site-packages/IPython/core/interactiveshell.py:3442: DtypeWarning: Columns (2) have mixed types.Specify dtype option on import or set low_memory=False.\n",
|
| 859 |
+
" exec(code_obj, self.user_global_ns, self.user_ns)\n"
|
| 860 |
+
]
|
| 861 |
+
},
|
| 862 |
+
{
|
| 863 |
+
"data": {
|
| 864 |
+
"text/html": [
|
| 865 |
+
"<div>\n",
|
| 866 |
+
"<style scoped>\n",
|
| 867 |
+
" .dataframe tbody tr th:only-of-type {\n",
|
| 868 |
+
" vertical-align: middle;\n",
|
| 869 |
+
" }\n",
|
| 870 |
+
"\n",
|
| 871 |
+
" .dataframe tbody tr th {\n",
|
| 872 |
+
" vertical-align: top;\n",
|
| 873 |
+
" }\n",
|
| 874 |
+
"\n",
|
| 875 |
+
" .dataframe thead th {\n",
|
| 876 |
+
" text-align: right;\n",
|
| 877 |
+
" }\n",
|
| 878 |
+
"</style>\n",
|
| 879 |
+
"<table border=\"1\" class=\"dataframe\">\n",
|
| 880 |
+
" <thead>\n",
|
| 881 |
+
" <tr style=\"text-align: right;\">\n",
|
| 882 |
+
" <th></th>\n",
|
| 883 |
+
" <th>name_x</th>\n",
|
| 884 |
+
" <th>state_x</th>\n",
|
| 885 |
+
" <th>level4_x</th>\n",
|
| 886 |
+
" <th>level3_x</th>\n",
|
| 887 |
+
" <th>level2_x</th>\n",
|
| 888 |
+
" <th>level1_x</th>\n",
|
| 889 |
+
" <th>Molecular Weight_x</th>\n",
|
| 890 |
+
" <th>logP_x</th>\n",
|
| 891 |
+
" <th>Water Solubility_x</th>\n",
|
| 892 |
+
" <th>logS_x</th>\n",
|
| 893 |
+
" <th>...</th>\n",
|
| 894 |
+
" <th>Rotatable Bond Count_y</th>\n",
|
| 895 |
+
" <th>Polar Surface Area (PSA)_y</th>\n",
|
| 896 |
+
" <th>pKa (strongest basic)_y</th>\n",
|
| 897 |
+
" <th>Ghose Filter_y</th>\n",
|
| 898 |
+
" <th>Monoisotopic Weight_y</th>\n",
|
| 899 |
+
" <th>MDDR-Like Rule_y</th>\n",
|
| 900 |
+
" <th>Polarizability_y</th>\n",
|
| 901 |
+
" <th>H Bond Acceptor Count_y</th>\n",
|
| 902 |
+
" <th>Physiological Charge_y</th>\n",
|
| 903 |
+
" <th>Rule of Five_y</th>\n",
|
| 904 |
+
" </tr>\n",
|
| 905 |
+
" </thead>\n",
|
| 906 |
+
" <tbody>\n",
|
| 907 |
+
" <tr>\n",
|
| 908 |
+
" <th>0</th>\n",
|
| 909 |
+
" <td>Bivalirudin</td>\n",
|
| 910 |
+
" <td>solid</td>\n",
|
| 911 |
+
" <td>B01AE</td>\n",
|
| 912 |
+
" <td>B01A</td>\n",
|
| 913 |
+
" <td>B01</td>\n",
|
| 914 |
+
" <td>B</td>\n",
|
| 915 |
+
" <td>2180.2853</td>\n",
|
| 916 |
+
" <td>-14.0</td>\n",
|
| 917 |
+
" <td>0.0464</td>\n",
|
| 918 |
+
" <td>-4.7</td>\n",
|
| 919 |
+
" <td>...</td>\n",
|
| 920 |
+
" <td>66.0</td>\n",
|
| 921 |
+
" <td>901.57</td>\n",
|
| 922 |
+
" <td>11.88</td>\n",
|
| 923 |
+
" <td>0.0</td>\n",
|
| 924 |
+
" <td>2178.985813</td>\n",
|
| 925 |
+
" <td>1.0</td>\n",
|
| 926 |
+
" <td>218.54</td>\n",
|
| 927 |
+
" <td>37.0</td>\n",
|
| 928 |
+
" <td>-4.0</td>\n",
|
| 929 |
+
" <td>0.0</td>\n",
|
| 930 |
+
" </tr>\n",
|
| 931 |
+
" <tr>\n",
|
| 932 |
+
" <th>1</th>\n",
|
| 933 |
+
" <td>Bivalirudin</td>\n",
|
| 934 |
+
" <td>solid</td>\n",
|
| 935 |
+
" <td>B01AE</td>\n",
|
| 936 |
+
" <td>B01A</td>\n",
|
| 937 |
+
" <td>B01</td>\n",
|
| 938 |
+
" <td>B</td>\n",
|
| 939 |
+
" <td>2180.2853</td>\n",
|
| 940 |
+
" <td>-14.0</td>\n",
|
| 941 |
+
" <td>0.0464</td>\n",
|
| 942 |
+
" <td>-4.7</td>\n",
|
| 943 |
+
" <td>...</td>\n",
|
| 944 |
+
" <td>32.0</td>\n",
|
| 945 |
+
" <td>429.04</td>\n",
|
| 946 |
+
" <td>11.92</td>\n",
|
| 947 |
+
" <td>0.0</td>\n",
|
| 948 |
+
" <td>1208.645462</td>\n",
|
| 949 |
+
" <td>1.0</td>\n",
|
| 950 |
+
" <td>125.24</td>\n",
|
| 951 |
+
" <td>16.0</td>\n",
|
| 952 |
+
" <td>1.0</td>\n",
|
| 953 |
+
" <td>0.0</td>\n",
|
| 954 |
+
" </tr>\n",
|
| 955 |
+
" <tr>\n",
|
| 956 |
+
" <th>2</th>\n",
|
| 957 |
+
" <td>Bivalirudin</td>\n",
|
| 958 |
+
" <td>solid</td>\n",
|
| 959 |
+
" <td>B01AE</td>\n",
|
| 960 |
+
" <td>B01A</td>\n",
|
| 961 |
+
" <td>B01</td>\n",
|
| 962 |
+
" <td>B</td>\n",
|
| 963 |
+
" <td>2180.2853</td>\n",
|
| 964 |
+
" <td>-14.0</td>\n",
|
| 965 |
+
" <td>0.0464</td>\n",
|
| 966 |
+
" <td>-4.7</td>\n",
|
| 967 |
+
" <td>...</td>\n",
|
| 968 |
+
" <td>33.0</td>\n",
|
| 969 |
+
" <td>495.89</td>\n",
|
| 970 |
+
" <td>10.91</td>\n",
|
| 971 |
+
" <td>0.0</td>\n",
|
| 972 |
+
" <td>1268.641439</td>\n",
|
| 973 |
+
" <td>1.0</td>\n",
|
| 974 |
+
" <td>130.74</td>\n",
|
| 975 |
+
" <td>18.0</td>\n",
|
| 976 |
+
" <td>1.0</td>\n",
|
| 977 |
+
" <td>0.0</td>\n",
|
| 978 |
+
" </tr>\n",
|
| 979 |
+
" <tr>\n",
|
| 980 |
+
" <th>3</th>\n",
|
| 981 |
+
" <td>Bivalirudin</td>\n",
|
| 982 |
+
" <td>solid</td>\n",
|
| 983 |
+
" <td>B01AE</td>\n",
|
| 984 |
+
" <td>B01A</td>\n",
|
| 985 |
+
" <td>B01</td>\n",
|
| 986 |
+
" <td>B</td>\n",
|
| 987 |
+
" <td>2180.2853</td>\n",
|
| 988 |
+
" <td>-14.0</td>\n",
|
| 989 |
+
" <td>0.0464</td>\n",
|
| 990 |
+
" <td>-4.7</td>\n",
|
| 991 |
+
" <td>...</td>\n",
|
| 992 |
+
" <td>50.0</td>\n",
|
| 993 |
+
" <td>519.89</td>\n",
|
| 994 |
+
" <td>NaN</td>\n",
|
| 995 |
+
" <td>0.0</td>\n",
|
| 996 |
+
" <td>1810.033419</td>\n",
|
| 997 |
+
" <td>1.0</td>\n",
|
| 998 |
+
" <td>194.73</td>\n",
|
| 999 |
+
" <td>16.0</td>\n",
|
| 1000 |
+
" <td>0.0</td>\n",
|
| 1001 |
+
" <td>0.0</td>\n",
|
| 1002 |
+
" </tr>\n",
|
| 1003 |
+
" <tr>\n",
|
| 1004 |
+
" <th>4</th>\n",
|
| 1005 |
+
" <td>Bivalirudin</td>\n",
|
| 1006 |
+
" <td>solid</td>\n",
|
| 1007 |
+
" <td>B01AE</td>\n",
|
| 1008 |
+
" <td>B01A</td>\n",
|
| 1009 |
+
" <td>B01</td>\n",
|
| 1010 |
+
" <td>B</td>\n",
|
| 1011 |
+
" <td>2180.2853</td>\n",
|
| 1012 |
+
" <td>-14.0</td>\n",
|
| 1013 |
+
" <td>0.0464</td>\n",
|
| 1014 |
+
" <td>-4.7</td>\n",
|
| 1015 |
+
" <td>...</td>\n",
|
| 1016 |
+
" <td>19.0</td>\n",
|
| 1017 |
+
" <td>435.41</td>\n",
|
| 1018 |
+
" <td>11.77</td>\n",
|
| 1019 |
+
" <td>0.0</td>\n",
|
| 1020 |
+
" <td>1068.426956</td>\n",
|
| 1021 |
+
" <td>1.0</td>\n",
|
| 1022 |
+
" <td>104.78</td>\n",
|
| 1023 |
+
" <td>15.0</td>\n",
|
| 1024 |
+
" <td>1.0</td>\n",
|
| 1025 |
+
" <td>0.0</td>\n",
|
| 1026 |
+
" </tr>\n",
|
| 1027 |
+
" <tr>\n",
|
| 1028 |
+
" <th>5</th>\n",
|
| 1029 |
+
" <td>Bivalirudin</td>\n",
|
| 1030 |
+
" <td>solid</td>\n",
|
| 1031 |
+
" <td>B01AE</td>\n",
|
| 1032 |
+
" <td>B01A</td>\n",
|
| 1033 |
+
" <td>B01</td>\n",
|
| 1034 |
+
" <td>B</td>\n",
|
| 1035 |
+
" <td>2180.2853</td>\n",
|
| 1036 |
+
" <td>-14.0</td>\n",
|
| 1037 |
+
" <td>0.0464</td>\n",
|
| 1038 |
+
" <td>-4.7</td>\n",
|
| 1039 |
+
" <td>...</td>\n",
|
| 1040 |
+
" <td>38.0</td>\n",
|
| 1041 |
+
" <td>495.67</td>\n",
|
| 1042 |
+
" <td>11.79</td>\n",
|
| 1043 |
+
" <td>0.0</td>\n",
|
| 1044 |
+
" <td>1429.669818</td>\n",
|
| 1045 |
+
" <td>1.0</td>\n",
|
| 1046 |
+
" <td>148.93</td>\n",
|
| 1047 |
+
" <td>18.0</td>\n",
|
| 1048 |
+
" <td>1.0</td>\n",
|
| 1049 |
+
" <td>0.0</td>\n",
|
| 1050 |
+
" </tr>\n",
|
| 1051 |
+
" <tr>\n",
|
| 1052 |
+
" <th>6</th>\n",
|
| 1053 |
+
" <td>Bivalirudin</td>\n",
|
| 1054 |
+
" <td>solid</td>\n",
|
| 1055 |
+
" <td>B01AE</td>\n",
|
| 1056 |
+
" <td>B01A</td>\n",
|
| 1057 |
+
" <td>B01</td>\n",
|
| 1058 |
+
" <td>B</td>\n",
|
| 1059 |
+
" <td>2180.2853</td>\n",
|
| 1060 |
+
" <td>-14.0</td>\n",
|
| 1061 |
+
" <td>0.0464</td>\n",
|
| 1062 |
+
" <td>-4.7</td>\n",
|
| 1063 |
+
" <td>...</td>\n",
|
| 1064 |
+
" <td>35.0</td>\n",
|
| 1065 |
+
" <td>702.02</td>\n",
|
| 1066 |
+
" <td>9.59</td>\n",
|
| 1067 |
+
" <td>0.0</td>\n",
|
| 1068 |
+
" <td>1619.710366</td>\n",
|
| 1069 |
+
" <td>1.0</td>\n",
|
| 1070 |
+
" <td>158.96</td>\n",
|
| 1071 |
+
" <td>27.0</td>\n",
|
| 1072 |
+
" <td>-3.0</td>\n",
|
| 1073 |
+
" <td>0.0</td>\n",
|
| 1074 |
+
" </tr>\n",
|
| 1075 |
+
" <tr>\n",
|
| 1076 |
+
" <th>7</th>\n",
|
| 1077 |
+
" <td>Bivalirudin</td>\n",
|
| 1078 |
+
" <td>solid</td>\n",
|
| 1079 |
+
" <td>B01AE</td>\n",
|
| 1080 |
+
" <td>B01A</td>\n",
|
| 1081 |
+
" <td>B01</td>\n",
|
| 1082 |
+
" <td>B</td>\n",
|
| 1083 |
+
" <td>2180.2853</td>\n",
|
| 1084 |
+
" <td>-14.0</td>\n",
|
| 1085 |
+
" <td>0.0464</td>\n",
|
| 1086 |
+
" <td>-4.7</td>\n",
|
| 1087 |
+
" <td>...</td>\n",
|
| 1088 |
+
" <td>38.0</td>\n",
|
| 1089 |
+
" <td>424.98</td>\n",
|
| 1090 |
+
" <td>10.66</td>\n",
|
| 1091 |
+
" <td>0.0</td>\n",
|
| 1092 |
+
" <td>1414.684072</td>\n",
|
| 1093 |
+
" <td>1.0</td>\n",
|
| 1094 |
+
" <td>149.31</td>\n",
|
| 1095 |
+
" <td>16.0</td>\n",
|
| 1096 |
+
" <td>1.0</td>\n",
|
| 1097 |
+
" <td>0.0</td>\n",
|
| 1098 |
+
" </tr>\n",
|
| 1099 |
+
" <tr>\n",
|
| 1100 |
+
" <th>8</th>\n",
|
| 1101 |
+
" <td>Bivalirudin</td>\n",
|
| 1102 |
+
" <td>solid</td>\n",
|
| 1103 |
+
" <td>B01AE</td>\n",
|
| 1104 |
+
" <td>B01A</td>\n",
|
| 1105 |
+
" <td>B01</td>\n",
|
| 1106 |
+
" <td>B</td>\n",
|
| 1107 |
+
" <td>2180.2853</td>\n",
|
| 1108 |
+
" <td>-14.0</td>\n",
|
| 1109 |
+
" <td>0.0464</td>\n",
|
| 1110 |
+
" <td>-4.7</td>\n",
|
| 1111 |
+
" <td>...</td>\n",
|
| 1112 |
+
" <td>4.0</td>\n",
|
| 1113 |
+
" <td>116.95</td>\n",
|
| 1114 |
+
" <td>4.11</td>\n",
|
| 1115 |
+
" <td>0.0</td>\n",
|
| 1116 |
+
" <td>247.024574</td>\n",
|
| 1117 |
+
" <td>0.0</td>\n",
|
| 1118 |
+
" <td>20.90</td>\n",
|
| 1119 |
+
" <td>6.0</td>\n",
|
| 1120 |
+
" <td>-2.0</td>\n",
|
| 1121 |
+
" <td>1.0</td>\n",
|
| 1122 |
+
" </tr>\n",
|
| 1123 |
+
" <tr>\n",
|
| 1124 |
+
" <th>9</th>\n",
|
| 1125 |
+
" <td>Bivalirudin</td>\n",
|
| 1126 |
+
" <td>solid</td>\n",
|
| 1127 |
+
" <td>B01AE</td>\n",
|
| 1128 |
+
" <td>B01A</td>\n",
|
| 1129 |
+
" <td>B01</td>\n",
|
| 1130 |
+
" <td>B</td>\n",
|
| 1131 |
+
" <td>2180.2853</td>\n",
|
| 1132 |
+
" <td>-14.0</td>\n",
|
| 1133 |
+
" <td>0.0464</td>\n",
|
| 1134 |
+
" <td>-4.7</td>\n",
|
| 1135 |
+
" <td>...</td>\n",
|
| 1136 |
+
" <td>27.0</td>\n",
|
| 1137 |
+
" <td>477.85</td>\n",
|
| 1138 |
+
" <td>8.68</td>\n",
|
| 1139 |
+
" <td>0.0</td>\n",
|
| 1140 |
+
" <td>1354.567405</td>\n",
|
| 1141 |
+
" <td>1.0</td>\n",
|
| 1142 |
+
" <td>138.79</td>\n",
|
| 1143 |
+
" <td>18.0</td>\n",
|
| 1144 |
+
" <td>3.0</td>\n",
|
| 1145 |
+
" <td>0.0</td>\n",
|
| 1146 |
+
" </tr>\n",
|
| 1147 |
+
" </tbody>\n",
|
| 1148 |
+
"</table>\n",
|
| 1149 |
+
"<p>10 rows × 50 columns</p>\n",
|
| 1150 |
+
"</div>"
|
| 1151 |
+
],
|
| 1152 |
+
"text/plain": [
|
| 1153 |
+
" name_x state_x level4_x level3_x level2_x level1_x \\\n",
|
| 1154 |
+
"0 Bivalirudin solid B01AE B01A B01 B \n",
|
| 1155 |
+
"1 Bivalirudin solid B01AE B01A B01 B \n",
|
| 1156 |
+
"2 Bivalirudin solid B01AE B01A B01 B \n",
|
| 1157 |
+
"3 Bivalirudin solid B01AE B01A B01 B \n",
|
| 1158 |
+
"4 Bivalirudin solid B01AE B01A B01 B \n",
|
| 1159 |
+
"5 Bivalirudin solid B01AE B01A B01 B \n",
|
| 1160 |
+
"6 Bivalirudin solid B01AE B01A B01 B \n",
|
| 1161 |
+
"7 Bivalirudin solid B01AE B01A B01 B \n",
|
| 1162 |
+
"8 Bivalirudin solid B01AE B01A B01 B \n",
|
| 1163 |
+
"9 Bivalirudin solid B01AE B01A B01 B \n",
|
| 1164 |
+
"\n",
|
| 1165 |
+
" Molecular Weight_x logP_x Water Solubility_x logS_x ... \\\n",
|
| 1166 |
+
"0 2180.2853 -14.0 0.0464 -4.7 ... \n",
|
| 1167 |
+
"1 2180.2853 -14.0 0.0464 -4.7 ... \n",
|
| 1168 |
+
"2 2180.2853 -14.0 0.0464 -4.7 ... \n",
|
| 1169 |
+
"3 2180.2853 -14.0 0.0464 -4.7 ... \n",
|
| 1170 |
+
"4 2180.2853 -14.0 0.0464 -4.7 ... \n",
|
| 1171 |
+
"5 2180.2853 -14.0 0.0464 -4.7 ... \n",
|
| 1172 |
+
"6 2180.2853 -14.0 0.0464 -4.7 ... \n",
|
| 1173 |
+
"7 2180.2853 -14.0 0.0464 -4.7 ... \n",
|
| 1174 |
+
"8 2180.2853 -14.0 0.0464 -4.7 ... \n",
|
| 1175 |
+
"9 2180.2853 -14.0 0.0464 -4.7 ... \n",
|
| 1176 |
+
"\n",
|
| 1177 |
+
" Rotatable Bond Count_y Polar Surface Area (PSA)_y \\\n",
|
| 1178 |
+
"0 66.0 901.57 \n",
|
| 1179 |
+
"1 32.0 429.04 \n",
|
| 1180 |
+
"2 33.0 495.89 \n",
|
| 1181 |
+
"3 50.0 519.89 \n",
|
| 1182 |
+
"4 19.0 435.41 \n",
|
| 1183 |
+
"5 38.0 495.67 \n",
|
| 1184 |
+
"6 35.0 702.02 \n",
|
| 1185 |
+
"7 38.0 424.98 \n",
|
| 1186 |
+
"8 4.0 116.95 \n",
|
| 1187 |
+
"9 27.0 477.85 \n",
|
| 1188 |
+
"\n",
|
| 1189 |
+
" pKa (strongest basic)_y Ghose Filter_y Monoisotopic Weight_y \\\n",
|
| 1190 |
+
"0 11.88 0.0 2178.985813 \n",
|
| 1191 |
+
"1 11.92 0.0 1208.645462 \n",
|
| 1192 |
+
"2 10.91 0.0 1268.641439 \n",
|
| 1193 |
+
"3 NaN 0.0 1810.033419 \n",
|
| 1194 |
+
"4 11.77 0.0 1068.426956 \n",
|
| 1195 |
+
"5 11.79 0.0 1429.669818 \n",
|
| 1196 |
+
"6 9.59 0.0 1619.710366 \n",
|
| 1197 |
+
"7 10.66 0.0 1414.684072 \n",
|
| 1198 |
+
"8 4.11 0.0 247.024574 \n",
|
| 1199 |
+
"9 8.68 0.0 1354.567405 \n",
|
| 1200 |
+
"\n",
|
| 1201 |
+
" MDDR-Like Rule_y Polarizability_y H Bond Acceptor Count_y \\\n",
|
| 1202 |
+
"0 1.0 218.54 37.0 \n",
|
| 1203 |
+
"1 1.0 125.24 16.0 \n",
|
| 1204 |
+
"2 1.0 130.74 18.0 \n",
|
| 1205 |
+
"3 1.0 194.73 16.0 \n",
|
| 1206 |
+
"4 1.0 104.78 15.0 \n",
|
| 1207 |
+
"5 1.0 148.93 18.0 \n",
|
| 1208 |
+
"6 1.0 158.96 27.0 \n",
|
| 1209 |
+
"7 1.0 149.31 16.0 \n",
|
| 1210 |
+
"8 0.0 20.90 6.0 \n",
|
| 1211 |
+
"9 1.0 138.79 18.0 \n",
|
| 1212 |
+
"\n",
|
| 1213 |
+
" Physiological Charge_y Rule of Five_y \n",
|
| 1214 |
+
"0 -4.0 0.0 \n",
|
| 1215 |
+
"1 1.0 0.0 \n",
|
| 1216 |
+
"2 1.0 0.0 \n",
|
| 1217 |
+
"3 0.0 0.0 \n",
|
| 1218 |
+
"4 1.0 0.0 \n",
|
| 1219 |
+
"5 1.0 0.0 \n",
|
| 1220 |
+
"6 -3.0 0.0 \n",
|
| 1221 |
+
"7 1.0 0.0 \n",
|
| 1222 |
+
"8 -2.0 1.0 \n",
|
| 1223 |
+
"9 3.0 0.0 \n",
|
| 1224 |
+
"\n",
|
| 1225 |
+
"[10 rows x 50 columns]"
|
| 1226 |
+
]
|
| 1227 |
+
},
|
| 1228 |
+
"execution_count": 1,
|
| 1229 |
+
"metadata": {},
|
| 1230 |
+
"output_type": "execute_result"
|
| 1231 |
+
}
|
| 1232 |
+
],
|
| 1233 |
+
"source": [
|
| 1234 |
+
"import pandas as pd\n",
|
| 1235 |
+
"\n",
|
| 1236 |
+
"catboost_df = pd.read_csv('datasets/drug_pairs.csv', index_col=0)\n",
|
| 1237 |
+
"catboost_df.head(10)"
|
| 1238 |
+
]
|
| 1239 |
+
},
|
| 1240 |
+
{
|
| 1241 |
+
"cell_type": "code",
|
| 1242 |
+
"execution_count": 3,
|
| 1243 |
+
"metadata": {},
|
| 1244 |
+
"outputs": [],
|
| 1245 |
+
"source": [
|
| 1246 |
+
"import json\n",
|
| 1247 |
+
"\n",
|
| 1248 |
+
"with open('interactions.json', 'r') as f:\n",
|
| 1249 |
+
" interactions = json.load(f)\n"
|
| 1250 |
+
]
|
| 1251 |
+
},
|
| 1252 |
+
{
|
| 1253 |
+
"cell_type": "code",
|
| 1254 |
+
"execution_count": 37,
|
| 1255 |
+
"metadata": {},
|
| 1256 |
+
"outputs": [],
|
| 1257 |
+
"source": [
|
| 1258 |
+
"# Create a new column in the dataframe to store the interaction label\n",
|
| 1259 |
+
"# For each drug pair, check if the interaction is present in the interactions dictionary\n",
|
| 1260 |
+
"# If yes, assign 1, else 0\n",
|
| 1261 |
+
"catboost_df['interaction'] = catboost_df.apply(lambda x: 1 if x['name_y'] in interactions.get(x['name_x'], list()) else 0, axis=1)"
|
| 1262 |
+
]
|
| 1263 |
+
},
|
| 1264 |
+
{
|
| 1265 |
+
"cell_type": "code",
|
| 1266 |
+
"execution_count": 27,
|
| 1267 |
+
"metadata": {},
|
| 1268 |
+
"outputs": [
|
| 1269 |
+
{
|
| 1270 |
+
"data": {
|
| 1271 |
+
"text/html": [
|
| 1272 |
+
"<div>\n",
|
| 1273 |
+
"<style scoped>\n",
|
| 1274 |
+
" .dataframe tbody tr th:only-of-type {\n",
|
| 1275 |
+
" vertical-align: middle;\n",
|
| 1276 |
+
" }\n",
|
| 1277 |
+
"\n",
|
| 1278 |
+
" .dataframe tbody tr th {\n",
|
| 1279 |
+
" vertical-align: top;\n",
|
| 1280 |
+
" }\n",
|
| 1281 |
+
"\n",
|
| 1282 |
+
" .dataframe thead th {\n",
|
| 1283 |
+
" text-align: right;\n",
|
| 1284 |
+
" }\n",
|
| 1285 |
+
"</style>\n",
|
| 1286 |
+
"<table border=\"1\" class=\"dataframe\">\n",
|
| 1287 |
+
" <thead>\n",
|
| 1288 |
+
" <tr style=\"text-align: right;\">\n",
|
| 1289 |
+
" <th></th>\n",
|
| 1290 |
+
" <th>drug1</th>\n",
|
| 1291 |
+
" <th>drug2</th>\n",
|
| 1292 |
+
" <th>interaction</th>\n",
|
| 1293 |
+
" </tr>\n",
|
| 1294 |
+
" </thead>\n",
|
| 1295 |
+
" <tbody>\n",
|
| 1296 |
+
" <tr>\n",
|
| 1297 |
+
" <th>0</th>\n",
|
| 1298 |
+
" <td>hey1</td>\n",
|
| 1299 |
+
" <td>hello1</td>\n",
|
| 1300 |
+
" <td>1</td>\n",
|
| 1301 |
+
" </tr>\n",
|
| 1302 |
+
" <tr>\n",
|
| 1303 |
+
" <th>1</th>\n",
|
| 1304 |
+
" <td>hey2</td>\n",
|
| 1305 |
+
" <td>hello2</td>\n",
|
| 1306 |
+
" <td>1</td>\n",
|
| 1307 |
+
" </tr>\n",
|
| 1308 |
+
" <tr>\n",
|
| 1309 |
+
" <th>2</th>\n",
|
| 1310 |
+
" <td>hey3</td>\n",
|
| 1311 |
+
" <td>hello3</td>\n",
|
| 1312 |
+
" <td>0</td>\n",
|
| 1313 |
+
" </tr>\n",
|
| 1314 |
+
" <tr>\n",
|
| 1315 |
+
" <th>3</th>\n",
|
| 1316 |
+
" <td>hey4</td>\n",
|
| 1317 |
+
" <td>hello4</td>\n",
|
| 1318 |
+
" <td>0</td>\n",
|
| 1319 |
+
" </tr>\n",
|
| 1320 |
+
" </tbody>\n",
|
| 1321 |
+
"</table>\n",
|
| 1322 |
+
"</div>"
|
| 1323 |
+
],
|
| 1324 |
+
"text/plain": [
|
| 1325 |
+
" drug1 drug2 interaction\n",
|
| 1326 |
+
"0 hey1 hello1 1\n",
|
| 1327 |
+
"1 hey2 hello2 1\n",
|
| 1328 |
+
"2 hey3 hello3 0\n",
|
| 1329 |
+
"3 hey4 hello4 0"
|
| 1330 |
+
]
|
| 1331 |
+
},
|
| 1332 |
+
"execution_count": 27,
|
| 1333 |
+
"metadata": {},
|
| 1334 |
+
"output_type": "execute_result"
|
| 1335 |
+
}
|
| 1336 |
+
],
|
| 1337 |
+
"source": [
|
| 1338 |
+
"'''\n",
|
| 1339 |
+
"Dummy example to show how to use the interactions dictionary\n",
|
| 1340 |
+
"'''\n",
|
| 1341 |
+
"\n",
|
| 1342 |
+
"dummy = pd.DataFrame({'drug1': ['hey1', 'hey2', 'hey3', 'hey4'], 'drug2': ['hello1', 'hello2', 'hello3', 'hello4']}, columns=['drug1', 'drug2'])\n",
|
| 1343 |
+
"i = {\n",
|
| 1344 |
+
" 'hey1': ['hello1'],\n",
|
| 1345 |
+
" 'hey2': ['hello1', 'hello2'],\n",
|
| 1346 |
+
" 'hey3': ['hello4'],\n",
|
| 1347 |
+
" 'hey4': [],\n",
|
| 1348 |
+
" 'hey5': ['hello1', 'hello2', 'hello3', 'hello4']\n",
|
| 1349 |
+
"}\n",
|
| 1350 |
+
"dummy['interaction'] = dummy.apply(lambda x: 1 if x['drug2'] in i.get(x['drug1'], list()) else 0, axis=1)\n",
|
| 1351 |
+
"dummy.head()"
|
| 1352 |
+
]
|
| 1353 |
+
},
|
| 1354 |
+
{
|
| 1355 |
+
"cell_type": "code",
|
| 1356 |
+
"execution_count": 40,
|
| 1357 |
+
"metadata": {},
|
| 1358 |
+
"outputs": [
|
| 1359 |
+
{
|
| 1360 |
+
"data": {
|
| 1361 |
+
"text/plain": [
|
| 1362 |
+
"<AxesSubplot:>"
|
| 1363 |
+
]
|
| 1364 |
+
},
|
| 1365 |
+
"execution_count": 40,
|
| 1366 |
+
"metadata": {},
|
| 1367 |
+
"output_type": "execute_result"
|
| 1368 |
+
},
|
| 1369 |
+
{
|
| 1370 |
+
"data": {
|
| 1371 |
+
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAhYAAAGsCAYAAACB/u5dAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjQuMywgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/MnkTPAAAACXBIWXMAAA9hAAAPYQGoP6dpAAAcw0lEQVR4nO3de5CVdf3A8c+yLAfRXRU3FHLxQoMWgjfS8PITTSJ1SGsqJ4rIvFSiUzJdNDOWvJGjjo2ZmZrkTEjZhJUiQhYymk5KMOElFfGaoqK5C2wdDrvP749mN3EBOev3POtZX68Z/tiHZ5/z4cPieXvO2T01WZZlAQCQQL/eHgAA6DuEBQCQjLAAAJIRFgBAMsICAEhGWAAAyQgLACAZYQEAJCMsAIBkhAUAkEyvhcWSJUti0qRJMWzYsKipqYnbbrut7GtkWRaXX355jBw5MgqFQrz//e+Piy++OP2wAMA26d9bN7x+/frYf//948tf/nJ86lOf6tE1vv71r8fChQvj8ssvj9GjR8frr78er7/+euJJAYBtVfNueBOympqamDdvXpx00kldx4rFYpx//vlxyy23xBtvvBH77bdf/PCHP4zx48dHRMRjjz0WY8aMiYcffjj22Wef3hkcANjEu/Y1FmeddVbcf//9MXfu3Pj73/8en/nMZ+LjH/94PPnkkxER8Yc//CH23nvvuP3222OvvfaKPffcM0477TSPWABAL3pXhsVzzz0XN910U9x6661x5JFHxogRI+Kb3/xmHHHEEXHTTTdFRMSqVavi2WefjVtvvTVuvvnmmD17dixdujQ+/elP9/L0APDe1WuvsdiaFStWRHt7e4wcOXKT48ViMXbZZZeIiOjo6IhisRg333xz13k33nhjHHzwwfH44497egQAesG7MizWrVsXtbW1sXTp0qitrd3k93bYYYeIiBg6dGj0799/k/j44Ac/GBH/fcRDWABA/t6VYXHggQdGe3t7vPLKK3HkkUdu9pzDDz88Nm7cGE899VSMGDEiIiKeeOKJiIjYY489cpsVAPifXvuukHXr1sXKlSsj4r8hceWVV8bRRx8dgwcPjuHDh8cXvvCFuO++++KKK66IAw88MF599dW4++67Y8yYMXHCCSdER0dHfPjDH44ddtghrrrqqujo6Ihp06ZFQ0NDLFy4sDf+SADwntdrYbF48eI4+uijux2fOnVqzJ49O0qlUlx00UVx8803xz//+c9obGyMj3zkIzFz5swYPXp0RES8+OKLcfbZZ8fChQtj++23j+OOOy6uuOKKGDx4cN5/HAAg3iU/xwIA6Bveld9uCgBUJ2EBACST+3eFdHR0xIsvvhj19fVRU1OT980DAD2QZVmsXbs2hg0bFv36bflxidzD4sUXX4ympqa8bxYASOD555+P3XfffYu/n3tY1NfXR8R/B2toaEh23VKpFAsXLoyPfexjUVdXl+y6bMqe82PX+bDnfNhzPiq559bW1mhqauq6H9+S3MOi8+mPhoaG5GExaNCgaGho8EVbQfacH7vOhz3nw57zkcee3+5lDF68CQAkIywAgGSEBQCQjLAAAJIRFgBAMsICAEhGWAAAyQgLACAZYQEAJCMsAIBkhAUAkIywAACSERYAQDLCAgBIJve3Ta+0/ZrvimL71t/S9d3kmVkn9PYIAJCMRywAgGSEBQCQjLAAAJIRFgBAMsICAEhGWAAAyQgLACAZYQEAJCMsAIBkhAUAkIywAACSERYAQDLCAgBIRlgAAMkICwAgGWEBACQjLACAZIQFAJCMsAAAkhEWAEAywgIASEZYAADJCAsAIBlhAQAkIywAgGSEBQCQjLAAAJIRFgBAMsICAEhGWAAAyQgLACCZssKiubk5ampqNvm17777Vmo2AKDK9C/3E0aNGhV//OMf/3eB/mVfAgDoo8qugv79+8duu+1WiVkAgCpXdlg8+eSTMWzYsBg4cGCMGzcuLr300hg+fPgWzy
8Wi1EsFrs+bm1tjYiIUqkUpVKpByNvXue1Cv2yZNfMQ8od5KFz3mqbuxrZdT7sOR/2nI9K7nlbr1mTZdk23xPfeeedsW7duthnn33ipZdeipkzZ8Y///nPePjhh6O+vn6zn9Pc3BwzZ87sdnzOnDkxaNCgbb1pAKAXtbW1xeTJk6OlpSUaGhq2eF5ZYfFWb7zxRuyxxx5x5ZVXxqmnnrrZczb3iEVTU1OsWbNmq4OVq1QqxaJFi+KCh/pFsaMm2XUr7eHmib09Qlk69zxhwoSoq6vr7XH6NLvOhz3nw57zUck9t7a2RmNj49uGxTt65eVOO+0UI0eOjJUrV27xnEKhEIVCodvxurq6inxxFTtqothePWFRrf/AKvX3R3d2nQ97zoc956MSe97W672jn2Oxbt26eOqpp2Lo0KHv5DIAQB9RVlh885vfjHvuuSeeeeaZ+Mtf/hKf/OQno7a2Nj73uc9Vaj4AoIqU9VTICy+8EJ/73Ofitddei/e9731xxBFHxAMPPBDve9/7KjUfAFBFygqLuXPnVmoOAKAP8F4hAEAywgIASEZYAADJCAsAIBlhAQAkIywAgGSEBQCQjLAAAJIRFgBAMsICAEhGWAAAyQgLACAZYQEAJCMsAIBkhAUAkIywAACSERYAQDLCAgBIRlgAAMkICwAgGWEBACQjLACAZIQFAJCMsAAAkhEWAEAywgIASEZYAADJCAsAIBlhAQAkIywAgGSEBQCQjLAAAJIRFgBAMsICAEhGWAAAyQgLACAZYQEAJCMsAIBkhAUAkIywAACSERYAQDLCAgBIRlgAAMkICwAgGWEBACQjLACAZIQFAJCMsAAAkhEWAEAywgIASEZYAADJCAsAIJl3FBazZs2Kmpqa+MY3vpFoHACgmvU4LB588MG47rrrYsyYMSnnAQCqWI/CYt26dfH5z38+rr/++th5551TzwQAVKn+PfmkadOmxQknnBDHHntsXHTRRVs9t1gsRrFY7Pq4tbU1IiJKpVKUSqWe3PxmdV6r0C9Lds08pNxBHjrnrba5q5Fd58Oe82HP+ajknrf1mjVZlpV1Tzx37ty4+OKL48EHH4yBAwfG+PHj44ADDoirrrpqs+c3NzfHzJkzux2fM2dODBo0qJybBgB6SVtbW0yePDlaWlqioaFhi+eVFRbPP/98jB07NhYtWtT12oq3C4vNPWLR1NQUa9as2epg5SqVSrFo0aK44KF+UeyoSXbdSnu4eWJvj1CWzj1PmDAh6urqenucPs2u82HP+bDnfFRyz62trdHY2Pi2YVHWUyFLly6NV155JQ466KCuY+3t7bFkyZL48Y9/HMViMWprazf5nEKhEIVCodu16urqKvLFVeyoiWJ79YRFtf4Dq9TfH93ZdT7sOR/2nI9K7Hlbr1dWWHz0ox+NFStWbHLslFNOiX333Te+853vdIsKAOC9paywqK+vj/3222+TY9tvv33ssssu3Y4DAO89fvImAJBMj77d9M0WL16cYAwAoC/wiAUAkIywAACSERYAQDLCAgBIRlgAAMkICwAgGWEBACQjLACAZIQFAJCMsAAAkhEWAEAywgIASEZYAADJCAsAIBlhAQAkIywAgGSEBQCQjLAAAJIRFgBAMsICAEhGWAAAyQgLACAZYQEAJCMsAIBkhAUAkIywAACSERYAQDLCAgBIRlgAAMkICwAgGWEBACQjLACAZIQFAJCMsAAAkhEWAEAywgIASEZYAADJCAsAIBlhAQAkIywAgGSEBQCQjLAAAJIRFgBAMsICAEhGWAAAyQgLACAZYQEAJCMsAIBkhAUAkIywAACSERYAQDLCAgBIRlgAAMmUFRbXXnttjBkzJhoaGqKhoSHGjRsXd955Z6VmAwCqTFlhsfvuu8esWbNi6dKl8dBDD8UxxxwTJ554YjzyyCOVmg8AqCL9yzl50qRJm3x88cUXx7XXXhsPPPBAjBo1KulgAED1KSss3qy9vT1uvfXWWL9+fYwbN2
6L5xWLxSgWi10ft7a2RkREqVSKUqnU05vvpvNahX5ZsmvmIeUO8tA5b7XNXY3sOh/2nA97zkcl97yt16zJsqyse+IVK1bEuHHj4j//+U/ssMMOMWfOnDj++OO3eH5zc3PMnDmz2/E5c+bEoEGDyrlpAKCXtLW1xeTJk6OlpSUaGhq2eF7ZYbFhw4Z47rnnoqWlJX7zm9/EDTfcEPfcc0986EMf2uz5m3vEoqmpKdasWbPVwcpVKpVi0aJFccFD/aLYUZPsupX2cPPE3h6hLJ17njBhQtTV1fX2OH2aXefDnvNhz/mo5J5bW1ujsbHxbcOi7KdCBgwYEB/4wAciIuLggw+OBx98MH70ox/Fddddt9nzC4VCFAqFbsfr6uoq8sVV7KiJYnv1hEW1/gOr1N8f3dl1Puw5H/acj0rseVuv945/jkVHR8cmj0gAAO9dZT1icd5558Vxxx0Xw4cPj7Vr18acOXNi8eLFcdddd1VqPgCgipQVFq+88kp88YtfjJdeeil23HHHGDNmTNx1110xYcKESs0HAFSRssLixhtvrNQcAEAf4L1CAIBkhAUAkIywAACSERYAQDLCAgBIRlgAAMkICwAgGWEBACQjLACAZIQFAJCMsAAAkhEWAEAywgIASEZYAADJCAsAIBlhAQAkIywAgGSEBQCQjLAAAJIRFgBAMsICAEhGWAAAyQgLACAZYQEAJCMsAIBkhAUAkIywAACSERYAQDLCAgBIRlgAAMkICwAgGWEBACQjLACAZIQFAJCMsAAAkhEWAEAywgIASEZYAADJCAsAIBlhAQAkIywAgGSEBQCQjLAAAJIRFgBAMsICAEhGWAAAyQgLACAZYQEAJCMsAIBkhAUAkIywAACSERYAQDJlhcWll14aH/7wh6O+vj6GDBkSJ510Ujz++OOVmg0AqDJlhcU999wT06ZNiwceeCAWLVoUpVIpPvaxj8X69esrNR8AUEX6l3PyggULNvl49uzZMWTIkFi6dGn83//9X9LBAIDqU1ZYvFVLS0tERAwePHiL5xSLxSgWi10ft7a2RkREqVSKUqn0Tm5+E53XKvTLkl0zDyl3kIfOeatt7mpk1/mw53zYcz4quedtvWZNlmU9uifu6OiIT3ziE/HGG2/Evffeu8XzmpubY+bMmd2Oz5kzJwYNGtSTmwYActbW1haTJ0+OlpaWaGho2OJ5PQ6Lr33ta3HnnXfGvffeG7vvvvsWz9vcIxZNTU2xZs2arQ5WrlKpFIsWLYoLHuoXxY6aZNettIebJ/b2CGXp3POECROirq6ut8fp0+w6H/acD3vORyX33NraGo2NjW8bFj16KuSss86K22+/PZYsWbLVqIiIKBQKUSgUuh2vq6uryBdXsaMmiu3VExbV+g+sUn9/dGfX+bDnfNhzPiqx5229XllhkWVZnH322TFv3rxYvHhx7LXXXj0aDgDom8oKi2nTpsWcOXPid7/7XdTX18fq1asjImLHHXeM7bbbriIDAgDVo6yfY3HttddGS0tLjB8/PoYOHdr161e/+lWl5gMAqkjZT4UAAGyJ9woBAJIRFgBAMsICAEhGWAAAyQgLACAZYQEAJCMsAIBkhAUAkIywAACSERYAQDLCAgBIRlgAAMkICwAgGWEBACQjLACAZIQFAJCMsAAAkhEWAEAywgIASEZYAADJCAsAIBlhAQAkIywAgGSEBQCQjLAAAJIRFgBAMsICAEhGWAAAyQgLACAZYQEAJCMsAIBkhAUAkIywAACSERYAQDL9e3sAAHi32vPcO3p7hLIUarO47JDencEjFgBAMsICAEhGWAAAyQgLACAZYQEAJCMsAIBkhAUAkIywAACSERYAQDLCAgBIRlgAAMkICwAgGWEBACQjLACAZIQFAJCMsAAAkhEWAEAyZYfFkiVLYtKkSTFs2LCoqamJ2267rQJjAQDVqOywWL9+fey///5xzTXXVGIeAKCK9S/3E4477rg47rjjKjELAFDlyg6Lch
WLxSgWi10ft7a2RkREqVSKUqmU7HY6r1XolyW7Zh5S7iAPnfNW29zVyK7zYc/5qNY9F2qr6z6l8z6wEnve1mvWZFnW463V1NTEvHnz4qSTTtriOc3NzTFz5sxux+fMmRODBg3q6U0DADlqa2uLyZMnR0tLSzQ0NGzxvIqHxeYesWhqaoo1a9ZsdbBylUqlWLRoUVzwUL8odtQku26lPdw8sbdHKEvnnidMmBB1dXW9PU6fZtf5sOd8VOue92u+q7dHKEuhXxYXju2oyJ5bW1ujsbHxbcOi4k+FFAqFKBQK3Y7X1dVV5Iur2FETxfbqCYtq+gf2ZpX6+6M7u86HPeej2vZcTfcnb1aJPW/r9fwcCwAgmbIfsVi3bl2sXLmy6+Onn346li9fHoMHD47hw4cnHQ4AqC5lh8VDDz0URx99dNfH06dPj4iIqVOnxuzZs5MNBgBUn7LDYvz48fEOXu8JAPRhXmMBACQjLACAZIQFAJCMsAAAkhEWAEAywgIASEZYAADJCAsAIBlhAQAkIywAgGSEBQCQjLAAAJIRFgBAMsICAEhGWAAAyQgLACAZYQEAJCMsAIBkhAUAkIywAACSERYAQDLCAgBIRlgAAMkICwAgGWEBACQjLACAZIQFAJCMsAAAkhEWAEAywgIASEZYAADJCAsAIBlhAQAkIywAgGSEBQCQjLAAAJIRFgBAMsICAEhGWAAAyQgLACAZYQEAJCMsAIBkhAUAkIywAACSERYAQDLCAgBIRlgAAMkICwAgGWEBACQjLACAZIQFAJCMsAAAkulRWFxzzTWx5557xsCBA+PQQw+Nv/71r6nnAgCqUNlh8atf/SqmT58eM2bMiL/97W+x//77x8SJE+OVV16pxHwAQBUpOyyuvPLKOP300+OUU06JD33oQ/HTn/40Bg0aFD//+c8rMR8AUEX6l3Pyhg0bYunSpXHeeed1HevXr18ce+yxcf/992/2c4rFYhSLxa6PW1paIiLi9ddfj1Kp1JOZN6tUKkVbW1v0L/WL9o6aZNettNdee623RyhL555fe+21qKur6+1x+jS7zoc956Na99x/4/reHqEs/TuyaGvrqMie165dGxERWZZtfYZyLrpmzZpob2+PXXfddZPju+66a/zjH//Y7OdceumlMXPmzG7H99prr3Juus9qvKK3JwCgL5lc4euvXbs2dtxxxy3+fllh0RPnnXdeTJ8+vevjjo6OeP3112OXXXaJmpp0jyy0trZGU1NTPP/889HQ0JDsumzKnvNj1/mw53zYcz4quecsy2Lt2rUxbNiwrZ5XVlg0NjZGbW1tvPzyy5scf/nll2O33Xbb7OcUCoUoFAqbHNtpp53KudmyNDQ0+KLNgT3nx67zYc/5sOd8VGrPW3ukolNZL94cMGBAHHzwwXH33Xd3Hevo6Ii77747xo0bV/6EAECfUvZTIdOnT4+pU6fG2LFj45BDDomrrroq1q9fH6ecckol5gMAqkjZYXHyySfHq6++Gt///vdj9erVccABB8SCBQu6vaAzb4VCIWbMmNHtaRfSsuf82HU+7Dkf9pyPd8Oea7K3+74RAIBt5L1CAIBkhAUAkIywAACSERYAQDJVFRblvl37rbfeGvvuu28MHDgwRo8eHfPnz89p0upWzp6vv/76OPLII2PnnXeOnXfeOY499ti3/Xvhv8r9eu40d+7cqKmpiZNOOqmyA/Yh5e76jTfeiGnTpsXQoUOjUCjEyJEj/fdjG5S756uuuir22Wef2G677aKpqSnOOeec+M9//pPTtNVpyZIlMWnSpBg2bFjU1NTEbbfd9rafs3jx4jjooIOiUCjEBz7wgZg9e3Zlh8yqxNy5c7MBAwZkP//5z7NHHnkkO/3007Oddtope/nllzd7/n333ZfV1tZml112Wfboo49m3/ve97K6urpsxYoVOU9eXcrd8+TJk7NrrrkmW7ZsWfbYY49lX/rSl7Idd9wxe+GFF3KevLqUu+dOTz/9dP
b+978/O/LII7MTTzwxn2GrXLm7LhaL2dixY7Pjjz8+u/fee7Onn346W7x4cbZ8+fKcJ68u5e75l7/8ZVYoFLJf/vKX2dNPP53ddddd2dChQ7Nzzjkn58mry/z587Pzzz8/++1vf5tFRDZv3rytnr9q1aps0KBB2fTp07NHH300u/rqq7Pa2tpswYIFFZuxasLikEMOyaZNm9b1cXt7ezZs2LDs0ksv3ez5n/3sZ7MTTjhhk2OHHnpo9pWvfKWic1a7cvf8Vhs3bszq6+uzX/ziF5UasU/oyZ43btyYHXbYYdkNN9yQTZ06VVhso3J3fe2112Z77713tmHDhrxG7BPK3fO0adOyY445ZpNj06dPzw4//PCKztmXbEtYfPvb385GjRq1ybGTTz45mzhxYsXmqoqnQjrfrv3YY4/tOvZ2b9d+//33b3J+RMTEiRO3eD492/NbtbW1RalUisGDB1dqzKrX0z3/4Ac/iCFDhsSpp56ax5h9Qk92/fvf/z7GjRsX06ZNi1133TX222+/uOSSS6K9vT2vsatOT/Z82GGHxdKlS7ueLlm1alXMnz8/jj/++Fxmfq/ojfvCir+7aQo9ebv21atXb/b81atXV2zOateTPb/Vd77znRg2bFi3L2T+pyd7vvfee+PGG2+M5cuX5zBh39GTXa9atSr+9Kc/xec///mYP39+rFy5Ms4888wolUoxY8aMPMauOj3Z8+TJk2PNmjVxxBFHRJZlsXHjxvjqV78a3/3ud/MY+T1jS/eFra2t8e9//zu222675LdZFY9YUB1mzZoVc+fOjXnz5sXAgQN7e5w+Y+3atTFlypS4/vrro7GxsbfH6fM6OjpiyJAh8bOf/SwOPvjgOPnkk+P888+Pn/70p709Wp+yePHiuOSSS+InP/lJ/O1vf4vf/va3cccdd8SFF17Y26PxDlXFIxY9ebv23Xbbrazz6dmeO11++eUxa9as+OMf/xhjxoyp5JhVr9w9P/XUU/HMM8/EpEmTuo51dHRERET//v3j8ccfjxEjRlR26CrVk6/poUOHRl1dXdTW1nYd++AHPxirV6+ODRs2xIABAyo6czXqyZ4vuOCCmDJlSpx22mkRETF69OhYv359nHHGGXH++edHv37+vzeFLd0XNjQ0VOTRiogqecSiJ2/XPm7cuE3Oj4hYtGiRt3ffip7sOSLisssuiwsvvDAWLFgQY8eOzWPUqlbunvfdd99YsWJFLF++vOvXJz7xiTj66KNj+fLl0dTUlOf4VaUnX9OHH354rFy5siveIiKeeOKJGDp0qKjYgp7sua2trVs8dMZc5i2skumV+8KKvSw0sblz52aFQiGbPXt29uijj2ZnnHFGttNOO2WrV6/OsizLpkyZkp177rld5993331Z//79s8svvzx77LHHshkzZvh2021Q7p5nzZqVDRgwIPvNb36TvfTSS12/1q5d21t/hKpQ7p7fyneFbLtyd/3cc89l9fX12VlnnZU9/vjj2e23354NGTIku+iii3rrj1AVyt3zjBkzsvr6+uyWW27JVq1alS1cuDAbMWJE9tnPfra3/ghVYe3atdmyZcuyZcuWZRGRXXnlldmyZcuyZ599NsuyLDv33HOzKVOmdJ3f+e2m3/rWt7LHHnssu+aaa3y76ZtdffXV2fDhw7MBAwZkhxxySPbAAw90/d5RRx2VTZ06dZPzf/3rX2cjR47MBgwYkI0aNSq74447cp64OpWz5z322COLiG6/ZsyYkf/gVabcr+c3ExblKXfXf/nLX7JDDz00KxQK2d57751dfPHF2caNG3OeuvqUs+dSqZQ1NzdnI0aMyAYOHJg1NTVlZ555Zvavf/0r/8GryJ///OfN/je3c7dTp07NjjrqqG6fc8ABB2QDBgzI9t577+ymm26q6IzeNh0ASKYqXmMBAFQHYQEAJCMsAIBkhAUAkIywAACSERYAQDLCAgBIRlgAAMkICwAgGWEBACQjLACAZIQFAJDM/w
PtyEe9ddUmcAAAAABJRU5ErkJggg==",
|
| 1372 |
+
"text/plain": [
|
| 1373 |
+
"<Figure size 640x480 with 1 Axes>"
|
| 1374 |
+
]
|
| 1375 |
+
},
|
| 1376 |
+
"metadata": {},
|
| 1377 |
+
"output_type": "display_data"
|
| 1378 |
+
}
|
| 1379 |
+
],
|
| 1380 |
+
"source": [
|
| 1381 |
+
"catboost_df[\"interaction\"].hist()\n"
|
| 1382 |
+
]
|
| 1383 |
+
},
|
| 1384 |
+
{
|
| 1385 |
+
"cell_type": "code",
|
| 1386 |
+
"execution_count": 45,
|
| 1387 |
+
"metadata": {},
|
| 1388 |
+
"outputs": [
|
| 1389 |
+
{
|
| 1390 |
+
"data": {
|
| 1391 |
+
"text/plain": [
|
| 1392 |
+
"0 5565768\n",
|
| 1393 |
+
"1 1351132\n",
|
| 1394 |
+
"Name: interaction, dtype: int64"
|
| 1395 |
+
]
|
| 1396 |
+
},
|
| 1397 |
+
"execution_count": 45,
|
| 1398 |
+
"metadata": {},
|
| 1399 |
+
"output_type": "execute_result"
|
| 1400 |
+
}
|
| 1401 |
+
],
|
| 1402 |
+
"source": [
|
| 1403 |
+
"catboost_df['interaction'].value_counts()"
|
| 1404 |
+
]
|
| 1405 |
+
},
|
| 1406 |
+
{
|
| 1407 |
+
"cell_type": "code",
|
| 1408 |
+
"execution_count": 46,
|
| 1409 |
+
"metadata": {},
|
| 1410 |
+
"outputs": [],
|
| 1411 |
+
"source": [
|
| 1412 |
+
"catboost_df.to_csv('datasets/catboost_df.csv')"
|
| 1413 |
+
]
|
| 1414 |
+
},
|
| 1415 |
+
{
|
| 1416 |
+
"cell_type": "code",
|
| 1417 |
+
"execution_count": 49,
|
| 1418 |
+
"metadata": {},
|
| 1419 |
+
"outputs": [
|
| 1420 |
+
{
|
| 1421 |
+
"data": {
|
| 1422 |
+
"text/html": [
|
| 1423 |
+
"<div>\n",
|
| 1424 |
+
"<style scoped>\n",
|
| 1425 |
+
" .dataframe tbody tr th:only-of-type {\n",
|
| 1426 |
+
" vertical-align: middle;\n",
|
| 1427 |
+
" }\n",
|
| 1428 |
+
"\n",
|
| 1429 |
+
" .dataframe tbody tr th {\n",
|
| 1430 |
+
" vertical-align: top;\n",
|
| 1431 |
+
" }\n",
|
| 1432 |
+
"\n",
|
| 1433 |
+
" .dataframe thead th {\n",
|
| 1434 |
+
" text-align: right;\n",
|
| 1435 |
+
" }\n",
|
| 1436 |
+
"</style>\n",
|
| 1437 |
+
"<table border=\"1\" class=\"dataframe\">\n",
|
| 1438 |
+
" <thead>\n",
|
| 1439 |
+
" <tr style=\"text-align: right;\">\n",
|
| 1440 |
+
" <th></th>\n",
|
| 1441 |
+
" <th>state_x</th>\n",
|
| 1442 |
+
" <th>level4_x</th>\n",
|
| 1443 |
+
" <th>level3_x</th>\n",
|
| 1444 |
+
" <th>level2_x</th>\n",
|
| 1445 |
+
" <th>level1_x</th>\n",
|
| 1446 |
+
" <th>Molecular Weight_x</th>\n",
|
| 1447 |
+
" <th>logP_x</th>\n",
|
| 1448 |
+
" <th>Water Solubility_x</th>\n",
|
| 1449 |
+
" <th>logS_x</th>\n",
|
| 1450 |
+
" <th>Bioavailability_x</th>\n",
|
| 1451 |
+
" <th>...</th>\n",
|
| 1452 |
+
" <th>Polar Surface Area (PSA)_y</th>\n",
|
| 1453 |
+
" <th>pKa (strongest basic)_y</th>\n",
|
| 1454 |
+
" <th>Ghose Filter_y</th>\n",
|
| 1455 |
+
" <th>Monoisotopic Weight_y</th>\n",
|
| 1456 |
+
" <th>MDDR-Like Rule_y</th>\n",
|
| 1457 |
+
" <th>Polarizability_y</th>\n",
|
| 1458 |
+
" <th>H Bond Acceptor Count_y</th>\n",
|
| 1459 |
+
" <th>Physiological Charge_y</th>\n",
|
| 1460 |
+
" <th>Rule of Five_y</th>\n",
|
| 1461 |
+
" <th>interaction</th>\n",
|
| 1462 |
+
" </tr>\n",
|
| 1463 |
+
" </thead>\n",
|
| 1464 |
+
" <tbody>\n",
|
| 1465 |
+
" <tr>\n",
|
| 1466 |
+
" <th>0</th>\n",
|
| 1467 |
+
" <td>solid</td>\n",
|
| 1468 |
+
" <td>B01AE</td>\n",
|
| 1469 |
+
" <td>B01A</td>\n",
|
| 1470 |
+
" <td>B01</td>\n",
|
| 1471 |
+
" <td>B</td>\n",
|
| 1472 |
+
" <td>2180.2853</td>\n",
|
| 1473 |
+
" <td>-14.0</td>\n",
|
| 1474 |
+
" <td>0.0464</td>\n",
|
| 1475 |
+
" <td>-4.7</td>\n",
|
| 1476 |
+
" <td>0.0</td>\n",
|
| 1477 |
+
" <td>...</td>\n",
|
| 1478 |
+
" <td>901.57</td>\n",
|
| 1479 |
+
" <td>11.88</td>\n",
|
| 1480 |
+
" <td>0.0</td>\n",
|
| 1481 |
+
" <td>2178.985813</td>\n",
|
| 1482 |
+
" <td>1.0</td>\n",
|
| 1483 |
+
" <td>218.54</td>\n",
|
| 1484 |
+
" <td>37.0</td>\n",
|
| 1485 |
+
" <td>-4.0</td>\n",
|
| 1486 |
+
" <td>0.0</td>\n",
|
| 1487 |
+
" <td>0</td>\n",
|
| 1488 |
+
" </tr>\n",
|
| 1489 |
+
" <tr>\n",
|
| 1490 |
+
" <th>1</th>\n",
|
| 1491 |
+
" <td>solid</td>\n",
|
| 1492 |
+
" <td>B01AE</td>\n",
|
| 1493 |
+
" <td>B01A</td>\n",
|
| 1494 |
+
" <td>B01</td>\n",
|
| 1495 |
+
" <td>B</td>\n",
|
| 1496 |
+
" <td>2180.2853</td>\n",
|
| 1497 |
+
" <td>-14.0</td>\n",
|
| 1498 |
+
" <td>0.0464</td>\n",
|
| 1499 |
+
" <td>-4.7</td>\n",
|
| 1500 |
+
" <td>0.0</td>\n",
|
| 1501 |
+
" <td>...</td>\n",
|
| 1502 |
+
" <td>429.04</td>\n",
|
| 1503 |
+
" <td>11.92</td>\n",
|
| 1504 |
+
" <td>0.0</td>\n",
|
| 1505 |
+
" <td>1208.645462</td>\n",
|
| 1506 |
+
" <td>1.0</td>\n",
|
| 1507 |
+
" <td>125.24</td>\n",
|
| 1508 |
+
" <td>16.0</td>\n",
|
| 1509 |
+
" <td>1.0</td>\n",
|
| 1510 |
+
" <td>0.0</td>\n",
|
| 1511 |
+
" <td>0</td>\n",
|
| 1512 |
+
" </tr>\n",
|
| 1513 |
+
" <tr>\n",
|
| 1514 |
+
" <th>2</th>\n",
|
| 1515 |
+
" <td>solid</td>\n",
|
| 1516 |
+
" <td>B01AE</td>\n",
|
| 1517 |
+
" <td>B01A</td>\n",
|
| 1518 |
+
" <td>B01</td>\n",
|
| 1519 |
+
" <td>B</td>\n",
|
| 1520 |
+
" <td>2180.2853</td>\n",
|
| 1521 |
+
" <td>-14.0</td>\n",
|
| 1522 |
+
" <td>0.0464</td>\n",
|
| 1523 |
+
" <td>-4.7</td>\n",
|
| 1524 |
+
" <td>0.0</td>\n",
|
| 1525 |
+
" <td>...</td>\n",
|
| 1526 |
+
" <td>495.89</td>\n",
|
| 1527 |
+
" <td>10.91</td>\n",
|
| 1528 |
+
" <td>0.0</td>\n",
|
| 1529 |
+
" <td>1268.641439</td>\n",
|
| 1530 |
+
" <td>1.0</td>\n",
|
| 1531 |
+
" <td>130.74</td>\n",
|
| 1532 |
+
" <td>18.0</td>\n",
|
| 1533 |
+
" <td>1.0</td>\n",
|
| 1534 |
+
" <td>0.0</td>\n",
|
| 1535 |
+
" <td>0</td>\n",
|
| 1536 |
+
" </tr>\n",
|
| 1537 |
+
" <tr>\n",
|
| 1538 |
+
" <th>3</th>\n",
|
| 1539 |
+
" <td>solid</td>\n",
|
| 1540 |
+
" <td>B01AE</td>\n",
|
| 1541 |
+
" <td>B01A</td>\n",
|
| 1542 |
+
" <td>B01</td>\n",
|
| 1543 |
+
" <td>B</td>\n",
|
| 1544 |
+
" <td>2180.2853</td>\n",
|
| 1545 |
+
" <td>-14.0</td>\n",
|
| 1546 |
+
" <td>0.0464</td>\n",
|
| 1547 |
+
" <td>-4.7</td>\n",
|
| 1548 |
+
" <td>0.0</td>\n",
|
| 1549 |
+
" <td>...</td>\n",
|
| 1550 |
+
" <td>519.89</td>\n",
|
| 1551 |
+
" <td>NaN</td>\n",
|
| 1552 |
+
" <td>0.0</td>\n",
|
| 1553 |
+
" <td>1810.033419</td>\n",
|
| 1554 |
+
" <td>1.0</td>\n",
|
| 1555 |
+
" <td>194.73</td>\n",
|
| 1556 |
+
" <td>16.0</td>\n",
|
| 1557 |
+
" <td>0.0</td>\n",
|
| 1558 |
+
" <td>0.0</td>\n",
|
| 1559 |
+
" <td>0</td>\n",
|
| 1560 |
+
" </tr>\n",
|
| 1561 |
+
" <tr>\n",
|
| 1562 |
+
" <th>4</th>\n",
|
| 1563 |
+
" <td>solid</td>\n",
|
| 1564 |
+
" <td>B01AE</td>\n",
|
| 1565 |
+
" <td>B01A</td>\n",
|
| 1566 |
+
" <td>B01</td>\n",
|
| 1567 |
+
" <td>B</td>\n",
|
| 1568 |
+
" <td>2180.2853</td>\n",
|
| 1569 |
+
" <td>-14.0</td>\n",
|
| 1570 |
+
" <td>0.0464</td>\n",
|
| 1571 |
+
" <td>-4.7</td>\n",
|
| 1572 |
+
" <td>0.0</td>\n",
|
| 1573 |
+
" <td>...</td>\n",
|
| 1574 |
+
" <td>435.41</td>\n",
|
| 1575 |
+
" <td>11.77</td>\n",
|
| 1576 |
+
" <td>0.0</td>\n",
|
| 1577 |
+
" <td>1068.426956</td>\n",
|
| 1578 |
+
" <td>1.0</td>\n",
|
| 1579 |
+
" <td>104.78</td>\n",
|
| 1580 |
+
" <td>15.0</td>\n",
|
| 1581 |
+
" <td>1.0</td>\n",
|
| 1582 |
+
" <td>0.0</td>\n",
|
| 1583 |
+
" <td>0</td>\n",
|
| 1584 |
+
" </tr>\n",
|
| 1585 |
+
" <tr>\n",
|
| 1586 |
+
" <th>5</th>\n",
|
| 1587 |
+
" <td>solid</td>\n",
|
| 1588 |
+
" <td>B01AE</td>\n",
|
| 1589 |
+
" <td>B01A</td>\n",
|
| 1590 |
+
" <td>B01</td>\n",
|
| 1591 |
+
" <td>B</td>\n",
|
| 1592 |
+
" <td>2180.2853</td>\n",
|
| 1593 |
+
" <td>-14.0</td>\n",
|
| 1594 |
+
" <td>0.0464</td>\n",
|
| 1595 |
+
" <td>-4.7</td>\n",
|
| 1596 |
+
" <td>0.0</td>\n",
|
| 1597 |
+
" <td>...</td>\n",
|
| 1598 |
+
" <td>495.67</td>\n",
|
| 1599 |
+
" <td>11.79</td>\n",
|
| 1600 |
+
" <td>0.0</td>\n",
|
| 1601 |
+
" <td>1429.669818</td>\n",
|
| 1602 |
+
" <td>1.0</td>\n",
|
| 1603 |
+
" <td>148.93</td>\n",
|
| 1604 |
+
" <td>18.0</td>\n",
|
| 1605 |
+
" <td>1.0</td>\n",
|
| 1606 |
+
" <td>0.0</td>\n",
|
| 1607 |
+
" <td>0</td>\n",
|
| 1608 |
+
" </tr>\n",
|
| 1609 |
+
" <tr>\n",
|
| 1610 |
+
" <th>6</th>\n",
|
| 1611 |
+
" <td>solid</td>\n",
|
| 1612 |
+
" <td>B01AE</td>\n",
|
| 1613 |
+
" <td>B01A</td>\n",
|
| 1614 |
+
" <td>B01</td>\n",
|
| 1615 |
+
" <td>B</td>\n",
|
| 1616 |
+
" <td>2180.2853</td>\n",
|
| 1617 |
+
" <td>-14.0</td>\n",
|
| 1618 |
+
" <td>0.0464</td>\n",
|
| 1619 |
+
" <td>-4.7</td>\n",
|
| 1620 |
+
" <td>0.0</td>\n",
|
| 1621 |
+
" <td>...</td>\n",
|
| 1622 |
+
" <td>702.02</td>\n",
|
| 1623 |
+
" <td>9.59</td>\n",
|
| 1624 |
+
" <td>0.0</td>\n",
|
| 1625 |
+
" <td>1619.710366</td>\n",
|
| 1626 |
+
" <td>1.0</td>\n",
|
| 1627 |
+
" <td>158.96</td>\n",
|
| 1628 |
+
" <td>27.0</td>\n",
|
| 1629 |
+
" <td>-3.0</td>\n",
|
| 1630 |
+
" <td>0.0</td>\n",
|
| 1631 |
+
" <td>0</td>\n",
|
| 1632 |
+
" </tr>\n",
|
| 1633 |
+
" <tr>\n",
|
| 1634 |
+
" <th>7</th>\n",
|
| 1635 |
+
" <td>solid</td>\n",
|
| 1636 |
+
" <td>B01AE</td>\n",
|
| 1637 |
+
" <td>B01A</td>\n",
|
| 1638 |
+
" <td>B01</td>\n",
|
| 1639 |
+
" <td>B</td>\n",
|
| 1640 |
+
" <td>2180.2853</td>\n",
|
| 1641 |
+
" <td>-14.0</td>\n",
|
| 1642 |
+
" <td>0.0464</td>\n",
|
| 1643 |
+
" <td>-4.7</td>\n",
|
| 1644 |
+
" <td>0.0</td>\n",
|
| 1645 |
+
" <td>...</td>\n",
|
| 1646 |
+
" <td>424.98</td>\n",
|
| 1647 |
+
" <td>10.66</td>\n",
|
| 1648 |
+
" <td>0.0</td>\n",
|
| 1649 |
+
" <td>1414.684072</td>\n",
|
| 1650 |
+
" <td>1.0</td>\n",
|
| 1651 |
+
" <td>149.31</td>\n",
|
| 1652 |
+
" <td>16.0</td>\n",
|
| 1653 |
+
" <td>1.0</td>\n",
|
| 1654 |
+
" <td>0.0</td>\n",
|
| 1655 |
+
" <td>0</td>\n",
|
| 1656 |
+
" </tr>\n",
|
| 1657 |
+
" <tr>\n",
|
| 1658 |
+
" <th>8</th>\n",
|
| 1659 |
+
" <td>solid</td>\n",
|
| 1660 |
+
" <td>B01AE</td>\n",
|
| 1661 |
+
" <td>B01A</td>\n",
|
| 1662 |
+
" <td>B01</td>\n",
|
| 1663 |
+
" <td>B</td>\n",
|
| 1664 |
+
" <td>2180.2853</td>\n",
|
| 1665 |
+
" <td>-14.0</td>\n",
|
| 1666 |
+
" <td>0.0464</td>\n",
|
| 1667 |
+
" <td>-4.7</td>\n",
|
| 1668 |
+
" <td>0.0</td>\n",
|
| 1669 |
+
" <td>...</td>\n",
|
| 1670 |
+
" <td>116.95</td>\n",
|
| 1671 |
+
" <td>4.11</td>\n",
|
| 1672 |
+
" <td>0.0</td>\n",
|
| 1673 |
+
" <td>247.024574</td>\n",
|
| 1674 |
+
" <td>0.0</td>\n",
|
| 1675 |
+
" <td>20.90</td>\n",
|
| 1676 |
+
" <td>6.0</td>\n",
|
| 1677 |
+
" <td>-2.0</td>\n",
|
| 1678 |
+
" <td>1.0</td>\n",
|
| 1679 |
+
" <td>0</td>\n",
|
| 1680 |
+
" </tr>\n",
|
| 1681 |
+
" <tr>\n",
|
| 1682 |
+
" <th>9</th>\n",
|
| 1683 |
+
" <td>solid</td>\n",
|
| 1684 |
+
" <td>B01AE</td>\n",
|
| 1685 |
+
" <td>B01A</td>\n",
|
| 1686 |
+
" <td>B01</td>\n",
|
| 1687 |
+
" <td>B</td>\n",
|
| 1688 |
+
" <td>2180.2853</td>\n",
|
| 1689 |
+
" <td>-14.0</td>\n",
|
| 1690 |
+
" <td>0.0464</td>\n",
|
| 1691 |
+
" <td>-4.7</td>\n",
|
| 1692 |
+
" <td>0.0</td>\n",
|
| 1693 |
+
" <td>...</td>\n",
|
| 1694 |
+
" <td>477.85</td>\n",
|
| 1695 |
+
" <td>8.68</td>\n",
|
| 1696 |
+
" <td>0.0</td>\n",
|
| 1697 |
+
" <td>1354.567405</td>\n",
|
| 1698 |
+
" <td>1.0</td>\n",
|
| 1699 |
+
" <td>138.79</td>\n",
|
| 1700 |
+
" <td>18.0</td>\n",
|
| 1701 |
+
" <td>3.0</td>\n",
|
| 1702 |
+
" <td>0.0</td>\n",
|
| 1703 |
+
" <td>0</td>\n",
|
| 1704 |
+
" </tr>\n",
|
| 1705 |
+
" </tbody>\n",
|
| 1706 |
+
"</table>\n",
|
| 1707 |
+
"<p>10 rows × 49 columns</p>\n",
|
| 1708 |
+
"</div>"
|
| 1709 |
+
],
|
| 1710 |
+
"text/plain": [
|
| 1711 |
+
" state_x level4_x level3_x level2_x level1_x Molecular Weight_x logP_x \\\n",
|
| 1712 |
+
"0 solid B01AE B01A B01 B 2180.2853 -14.0 \n",
|
| 1713 |
+
"1 solid B01AE B01A B01 B 2180.2853 -14.0 \n",
|
| 1714 |
+
"2 solid B01AE B01A B01 B 2180.2853 -14.0 \n",
|
| 1715 |
+
"3 solid B01AE B01A B01 B 2180.2853 -14.0 \n",
|
| 1716 |
+
"4 solid B01AE B01A B01 B 2180.2853 -14.0 \n",
|
| 1717 |
+
"5 solid B01AE B01A B01 B 2180.2853 -14.0 \n",
|
| 1718 |
+
"6 solid B01AE B01A B01 B 2180.2853 -14.0 \n",
|
| 1719 |
+
"7 solid B01AE B01A B01 B 2180.2853 -14.0 \n",
|
| 1720 |
+
"8 solid B01AE B01A B01 B 2180.2853 -14.0 \n",
|
| 1721 |
+
"9 solid B01AE B01A B01 B 2180.2853 -14.0 \n",
|
| 1722 |
+
"\n",
|
| 1723 |
+
" Water Solubility_x logS_x Bioavailability_x ... \\\n",
|
| 1724 |
+
"0 0.0464 -4.7 0.0 ... \n",
|
| 1725 |
+
"1 0.0464 -4.7 0.0 ... \n",
|
| 1726 |
+
"2 0.0464 -4.7 0.0 ... \n",
|
| 1727 |
+
"3 0.0464 -4.7 0.0 ... \n",
|
| 1728 |
+
"4 0.0464 -4.7 0.0 ... \n",
|
| 1729 |
+
"5 0.0464 -4.7 0.0 ... \n",
|
| 1730 |
+
"6 0.0464 -4.7 0.0 ... \n",
|
| 1731 |
+
"7 0.0464 -4.7 0.0 ... \n",
|
| 1732 |
+
"8 0.0464 -4.7 0.0 ... \n",
|
| 1733 |
+
"9 0.0464 -4.7 0.0 ... \n",
|
| 1734 |
+
"\n",
|
| 1735 |
+
" Polar Surface Area (PSA)_y pKa (strongest basic)_y Ghose Filter_y \\\n",
|
| 1736 |
+
"0 901.57 11.88 0.0 \n",
|
| 1737 |
+
"1 429.04 11.92 0.0 \n",
|
| 1738 |
+
"2 495.89 10.91 0.0 \n",
|
| 1739 |
+
"3 519.89 NaN 0.0 \n",
|
| 1740 |
+
"4 435.41 11.77 0.0 \n",
|
| 1741 |
+
"5 495.67 11.79 0.0 \n",
|
| 1742 |
+
"6 702.02 9.59 0.0 \n",
|
| 1743 |
+
"7 424.98 10.66 0.0 \n",
|
| 1744 |
+
"8 116.95 4.11 0.0 \n",
|
| 1745 |
+
"9 477.85 8.68 0.0 \n",
|
| 1746 |
+
"\n",
|
| 1747 |
+
" Monoisotopic Weight_y MDDR-Like Rule_y Polarizability_y \\\n",
|
| 1748 |
+
"0 2178.985813 1.0 218.54 \n",
|
| 1749 |
+
"1 1208.645462 1.0 125.24 \n",
|
| 1750 |
+
"2 1268.641439 1.0 130.74 \n",
|
| 1751 |
+
"3 1810.033419 1.0 194.73 \n",
|
| 1752 |
+
"4 1068.426956 1.0 104.78 \n",
|
| 1753 |
+
"5 1429.669818 1.0 148.93 \n",
|
| 1754 |
+
"6 1619.710366 1.0 158.96 \n",
|
| 1755 |
+
"7 1414.684072 1.0 149.31 \n",
|
| 1756 |
+
"8 247.024574 0.0 20.90 \n",
|
| 1757 |
+
"9 1354.567405 1.0 138.79 \n",
|
| 1758 |
+
"\n",
|
| 1759 |
+
" H Bond Acceptor Count_y Physiological Charge_y Rule of Five_y \\\n",
|
| 1760 |
+
"0 37.0 -4.0 0.0 \n",
|
| 1761 |
+
"1 16.0 1.0 0.0 \n",
|
| 1762 |
+
"2 18.0 1.0 0.0 \n",
|
| 1763 |
+
"3 16.0 0.0 0.0 \n",
|
| 1764 |
+
"4 15.0 1.0 0.0 \n",
|
| 1765 |
+
"5 18.0 1.0 0.0 \n",
|
| 1766 |
+
"6 27.0 -3.0 0.0 \n",
|
| 1767 |
+
"7 16.0 1.0 0.0 \n",
|
| 1768 |
+
"8 6.0 -2.0 1.0 \n",
|
| 1769 |
+
"9 18.0 3.0 0.0 \n",
|
| 1770 |
+
"\n",
|
| 1771 |
+
" interaction \n",
|
| 1772 |
+
"0 0 \n",
|
| 1773 |
+
"1 0 \n",
|
| 1774 |
+
"2 0 \n",
|
| 1775 |
+
"3 0 \n",
|
| 1776 |
+
"4 0 \n",
|
| 1777 |
+
"5 0 \n",
|
| 1778 |
+
"6 0 \n",
|
| 1779 |
+
"7 0 \n",
|
| 1780 |
+
"8 0 \n",
|
| 1781 |
+
"9 0 \n",
|
| 1782 |
+
"\n",
|
| 1783 |
+
"[10 rows x 49 columns]"
|
| 1784 |
+
]
|
| 1785 |
+
},
|
| 1786 |
+
"execution_count": 49,
|
| 1787 |
+
"metadata": {},
|
| 1788 |
+
"output_type": "execute_result"
|
| 1789 |
+
}
|
| 1790 |
+
],
|
| 1791 |
+
"source": [
|
| 1792 |
+
"# drop label name_x and name_y\n",
|
| 1793 |
+
"catboost_df = catboost_df.drop(['name_x', 'name_y'], axis=1)\n",
|
| 1794 |
+
"catboost_df.head(10)"
|
| 1795 |
+
]
|
| 1796 |
+
},
|
| 1797 |
+
{
|
| 1798 |
+
"cell_type": "code",
|
| 1799 |
+
"execution_count": 68,
|
| 1800 |
+
"metadata": {},
|
| 1801 |
+
"outputs": [],
|
| 1802 |
+
"source": [
|
| 1803 |
+
"# create test and train set\n",
|
| 1804 |
+
"from sklearn.model_selection import train_test_split\n",
|
| 1805 |
+
"X, y = catboost_df.drop('interaction', axis=1), catboost_df['interaction']\n",
|
| 1806 |
+
"X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)"
|
| 1807 |
+
]
|
| 1808 |
+
},
|
| 1809 |
+
{
|
| 1810 |
+
"cell_type": "code",
|
| 1811 |
+
"execution_count": 66,
|
| 1812 |
+
"metadata": {},
|
| 1813 |
+
"outputs": [
|
| 1814 |
+
{
|
| 1815 |
+
"name": "stdout",
|
| 1816 |
+
"output_type": "stream",
|
| 1817 |
+
"text": [
|
| 1818 |
+
"['state_x', 'level4_x', 'level3_x', 'level2_x', 'level1_x', 'state_y', 'level4_y', 'level3_y', 'level2_y', 'level1_y']\n",
|
| 1819 |
+
"['state_x', 'level4_x', 'level3_x', 'level2_x', 'level1_x', 'Molecular Weight_x', 'logP_x', 'Water Solubility_x', 'logS_x', 'Bioavailability_x', 'pKa (strongest acidic)_x', 'Refractivity_x', 'Number of Rings_x', 'H Bond Donor Count_x', 'Rotatable Bond Count_x', 'Polar Surface Area (PSA)_x', 'pKa (strongest basic)_x', 'Ghose Filter_x', 'Monoisotopic Weight_x', 'MDDR-Like Rule_x', 'Polarizability_x', 'H Bond Acceptor Count_x', 'Physiological Charge_x', 'Rule of Five_x', 'state_y', 'level4_y', 'level3_y', 'level2_y', 'level1_y', 'Molecular Weight_y', 'logP_y', 'Water Solubility_y', 'logS_y', 'Bioavailability_y', 'pKa (strongest acidic)_y', 'Refractivity_y', 'Number of Rings_y', 'H Bond Donor Count_y', 'Rotatable Bond Count_y', 'Polar Surface Area (PSA)_y', 'pKa (strongest basic)_y', 'Ghose Filter_y', 'Monoisotopic Weight_y', 'MDDR-Like Rule_y', 'Polarizability_y', 'H Bond Acceptor Count_y', 'Physiological Charge_y', 'Rule of Five_y', 'interaction']\n",
|
| 1820 |
+
"['Molecular Weight_x', 'logP_x', 'Water Solubility_x', 'logS_x', 'Bioavailability_x', 'pKa (strongest acidic)_x', 'Refractivity_x', 'Number of Rings_x', 'H Bond Donor Count_x', 'Rotatable Bond Count_x', 'Polar Surface Area (PSA)_x', 'pKa (strongest basic)_x', 'Ghose Filter_x', 'Monoisotopic Weight_x', 'MDDR-Like Rule_x', 'Polarizability_x', 'H Bond Acceptor Count_x', 'Physiological Charge_x', 'Rule of Five_x', 'Molecular Weight_y', 'logP_y', 'Water Solubility_y', 'logS_y', 'Bioavailability_y', 'pKa (strongest acidic)_y', 'Refractivity_y', 'Number of Rings_y', 'H Bond Donor Count_y', 'Rotatable Bond Count_y', 'Polar Surface Area (PSA)_y', 'pKa (strongest basic)_y', 'Ghose Filter_y', 'Monoisotopic Weight_y', 'MDDR-Like Rule_y', 'Polarizability_y', 'H Bond Acceptor Count_y', 'Physiological Charge_y', 'Rule of Five_y']\n",
|
| 1821 |
+
"10 + 38 = 49\n"
|
| 1822 |
+
]
|
| 1823 |
+
}
|
| 1824 |
+
],
|
| 1825 |
+
"source": [
|
| 1826 |
+
"# get all the columns whose dtype is object\n",
|
| 1827 |
+
"cat_features = list(catboost_df.select_dtypes(include=['object']).columns)\n",
|
| 1828 |
+
"print(cat_features)\n",
|
| 1829 |
+
"print(list(catboost_df.columns))\n",
|
| 1830 |
+
"float_features = list(catboost_df.select_dtypes(include=['float64']).columns)\n",
|
| 1831 |
+
"print(float_features)\n",
|
| 1832 |
+
"print(f\"{len(cat_features)} + {len(float_features)} = {len(catboost_df.columns)}\")"
|
| 1833 |
+
]
|
| 1834 |
+
}
|
| 1835 |
+
],
|
| 1836 |
+
"metadata": {
|
| 1837 |
+
"kernelspec": {
|
| 1838 |
+
"display_name": "Python 3",
|
| 1839 |
+
"language": "python",
|
| 1840 |
+
"name": "python3"
|
| 1841 |
+
},
|
| 1842 |
+
"language_info": {
|
| 1843 |
+
"codemirror_mode": {
|
| 1844 |
+
"name": "ipython",
|
| 1845 |
+
"version": 3
|
| 1846 |
+
},
|
| 1847 |
+
"file_extension": ".py",
|
| 1848 |
+
"mimetype": "text/x-python",
|
| 1849 |
+
"name": "python",
|
| 1850 |
+
"nbconvert_exporter": "python",
|
| 1851 |
+
"pygments_lexer": "ipython3",
|
| 1852 |
+
"version": "3.8.10"
|
| 1853 |
+
},
|
| 1854 |
+
"orig_nbformat": 4
|
| 1855 |
+
},
|
| 1856 |
+
"nbformat": 4,
|
| 1857 |
+
"nbformat_minor": 2
|
| 1858 |
+
}
|
catboost/train.py
ADDED
|
@@ -0,0 +1,66 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
import pandas as pd
|
| 3 |
+
from sklearn.model_selection import train_test_split
|
| 4 |
+
from sklearn.experimental import enable_halving_search_cv
|
| 5 |
+
from sklearn.model_selection import HalvingGridSearchCV, RandomizedSearchCV
|
| 6 |
+
from catboost import CatBoostClassifier, Pool
|
| 7 |
+
from sklearn.metrics import roc_auc_score
|
| 8 |
+
from sklearn.metrics import accuracy_score
|
| 9 |
+
from pandas.core.common import random_state
|
| 10 |
+
|
| 11 |
+
# Training script: load the pairwise feature table from
# datasets/catboost_df.csv, impute missing numeric values, fit a CatBoost
# classifier through a (single-candidate) halving grid search, save the best
# estimator to models/catboost_model2.cbm and print the test ROC AUC.
# Paths are relative, so the script must be run from the directory that
# contains the datasets/ and models/ folders.

# load catboost_df (the first CSV column is the saved DataFrame index)
catboost_df = pd.read_csv('datasets/catboost_df.csv', index_col=0)
# drop the name_x / name_y columns so they are not passed to the model
catboost_df = catboost_df.drop(['name_x', 'name_y'], axis=1)

# get the categorical (object dtype) and float features
cat_features = list(catboost_df.select_dtypes(include=['object']).columns)
float_features = list(catboost_df.select_dtypes(include=['float64']).columns)

# Fill NaN values with the mean of the non-missing values in the same column.
# The filled column is assigned back to the frame: calling
# fillna(..., inplace=True) on catboost_df[feature] works on an intermediate
# Series and is not guaranteed to modify catboost_df (under pandas
# Copy-on-Write it never does), which would silently leave the NaNs in place.
# NOTE(review): the means are computed on the whole table, before the
# train/test split below, so test rows influence the imputed values --
# confirm this is acceptable for the reported test score.
for feature in float_features:
    mean_value = catboost_df[feature].mean()
    catboost_df[feature] = catboost_df[feature].fillna(mean_value)

# Cast categorical columns to str so that missing entries become the literal
# string 'nan' instead of a float NaN.
for feature in cat_features:
    catboost_df[feature] = catboost_df[feature].astype(str)

# create test and train set (80/20 split, fixed seed for reproducibility)
X, y = catboost_df.drop('interaction', axis=1), catboost_df['interaction']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)

catb_model = CatBoostClassifier(random_state=42, task_type="GPU", max_ctr_complexity=1, boosting_type="Plain",
                                cat_features=cat_features, gpu_ram_part=0.4)
# Each hyper-parameter currently has a single value, so the search below
# evaluates exactly one candidate; extend the lists to actually tune.
catb_param = {
    'max_depth': [6],
    'learning_rate': [0.01],
    'reg_lambda': [2.5],
    'n_estimators': [1000],
}


# pool_train = Pool(X_train, y_train, cat_features = cat_features)
# pool_test = Pool(X_test, cat_features = cat_features)

# grid search
# NOTE(review): n_jobs=-1 lets the cross-validation fits run in parallel
# worker processes while every fit uses task_type="GPU" with
# gpu_ram_part=0.4 -- verify that concurrent fits fit on the available
# device(s), or lower n_jobs.
grid_search = HalvingGridSearchCV(
    catb_model, catb_param, cv=3, n_jobs=-1, verbose=2)
grid_search.fit(X_train, y_train)

print("Done")

# persist the estimator refitted on the full training split
best_model = grid_search.best_estimator_
best_model.save_model('models/catboost_model2.cbm')


# print best parameters
print(grid_search.best_params_)
# print best score
print(grid_search.best_score_)


# evaluate on the held-out split using the positive-class probability
y_p = grid_search.predict_proba(X_test)
print(f"Test AUC_ROC score = {roc_auc_score(y_test, y_p[:, 1])}")

print("---------------------Done--------------------------------")
|
dimensionality_reduction.ipynb
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
link_prediction.ipynb
ADDED
|
@@ -0,0 +1,1482 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"cells": [
|
| 3 |
+
{
|
| 4 |
+
"cell_type": "code",
|
| 5 |
+
"execution_count": 1,
|
| 6 |
+
"metadata": {},
|
| 7 |
+
"outputs": [],
|
| 8 |
+
"source": [
|
| 9 |
+
"import pandas as pd\n",
|
| 10 |
+
"import numpy as np\n",
|
| 11 |
+
"import json"
|
| 12 |
+
]
|
| 13 |
+
},
|
| 14 |
+
{
|
| 15 |
+
"cell_type": "code",
|
| 16 |
+
"execution_count": 2,
|
| 17 |
+
"metadata": {},
|
| 18 |
+
"outputs": [],
|
| 19 |
+
"source": [
|
| 20 |
+
"with open('data/interactions.json') as f:\n",
|
| 21 |
+
" ddi_json = json.load(f)"
|
| 22 |
+
]
|
| 23 |
+
},
|
| 24 |
+
{
|
| 25 |
+
"cell_type": "code",
|
| 26 |
+
"execution_count": 3,
|
| 27 |
+
"metadata": {},
|
| 28 |
+
"outputs": [
|
| 29 |
+
{
|
| 30 |
+
"data": {
|
| 31 |
+
"text/html": [
|
| 32 |
+
"<div>\n",
|
| 33 |
+
"<style scoped>\n",
|
| 34 |
+
" .dataframe tbody tr th:only-of-type {\n",
|
| 35 |
+
" vertical-align: middle;\n",
|
| 36 |
+
" }\n",
|
| 37 |
+
"\n",
|
| 38 |
+
" .dataframe tbody tr th {\n",
|
| 39 |
+
" vertical-align: top;\n",
|
| 40 |
+
" }\n",
|
| 41 |
+
"\n",
|
| 42 |
+
" .dataframe thead th {\n",
|
| 43 |
+
" text-align: right;\n",
|
| 44 |
+
" }\n",
|
| 45 |
+
"</style>\n",
|
| 46 |
+
"<table border=\"1\" class=\"dataframe\">\n",
|
| 47 |
+
" <thead>\n",
|
| 48 |
+
" <tr style=\"text-align: right;\">\n",
|
| 49 |
+
" <th></th>\n",
|
| 50 |
+
" <th>Unnamed: 0</th>\n",
|
| 51 |
+
" <th>name</th>\n",
|
| 52 |
+
" <th>state</th>\n",
|
| 53 |
+
" <th>level4</th>\n",
|
| 54 |
+
" <th>level3</th>\n",
|
| 55 |
+
" <th>level2</th>\n",
|
| 56 |
+
" <th>level1</th>\n",
|
| 57 |
+
" <th>Molecular Weight</th>\n",
|
| 58 |
+
" <th>logP</th>\n",
|
| 59 |
+
" <th>Water Solubility</th>\n",
|
| 60 |
+
" <th>...</th>\n",
|
| 61 |
+
" <th>Rotatable Bond Count</th>\n",
|
| 62 |
+
" <th>Polar Surface Area (PSA)</th>\n",
|
| 63 |
+
" <th>pKa (strongest basic)</th>\n",
|
| 64 |
+
" <th>Ghose Filter</th>\n",
|
| 65 |
+
" <th>Monoisotopic Weight</th>\n",
|
| 66 |
+
" <th>MDDR-Like Rule</th>\n",
|
| 67 |
+
" <th>Polarizability</th>\n",
|
| 68 |
+
" <th>H Bond Acceptor Count</th>\n",
|
| 69 |
+
" <th>Physiological Charge</th>\n",
|
| 70 |
+
" <th>Rule of Five</th>\n",
|
| 71 |
+
" </tr>\n",
|
| 72 |
+
" </thead>\n",
|
| 73 |
+
" <tbody>\n",
|
| 74 |
+
" <tr>\n",
|
| 75 |
+
" <th>0</th>\n",
|
| 76 |
+
" <td>5</td>\n",
|
| 77 |
+
" <td>Bivalirudin</td>\n",
|
| 78 |
+
" <td>solid</td>\n",
|
| 79 |
+
" <td>B01AE</td>\n",
|
| 80 |
+
" <td>B01A</td>\n",
|
| 81 |
+
" <td>B01</td>\n",
|
| 82 |
+
" <td>B</td>\n",
|
| 83 |
+
" <td>2180.2853</td>\n",
|
| 84 |
+
" <td>-14.00</td>\n",
|
| 85 |
+
" <td>0.04640</td>\n",
|
| 86 |
+
" <td>...</td>\n",
|
| 87 |
+
" <td>66.0</td>\n",
|
| 88 |
+
" <td>901.57</td>\n",
|
| 89 |
+
" <td>11.88</td>\n",
|
| 90 |
+
" <td>0.0</td>\n",
|
| 91 |
+
" <td>2178.985813</td>\n",
|
| 92 |
+
" <td>1.0</td>\n",
|
| 93 |
+
" <td>218.54</td>\n",
|
| 94 |
+
" <td>37.0</td>\n",
|
| 95 |
+
" <td>-4.0</td>\n",
|
| 96 |
+
" <td>0.0</td>\n",
|
| 97 |
+
" </tr>\n",
|
| 98 |
+
" <tr>\n",
|
| 99 |
+
" <th>1</th>\n",
|
| 100 |
+
" <td>6</td>\n",
|
| 101 |
+
" <td>Leuprolide</td>\n",
|
| 102 |
+
" <td>solid</td>\n",
|
| 103 |
+
" <td>L02AE</td>\n",
|
| 104 |
+
" <td>L02A</td>\n",
|
| 105 |
+
" <td>L02</td>\n",
|
| 106 |
+
" <td>L</td>\n",
|
| 107 |
+
" <td>1209.3983</td>\n",
|
| 108 |
+
" <td>-2.40</td>\n",
|
| 109 |
+
" <td>0.03380</td>\n",
|
| 110 |
+
" <td>...</td>\n",
|
| 111 |
+
" <td>32.0</td>\n",
|
| 112 |
+
" <td>429.04</td>\n",
|
| 113 |
+
" <td>11.92</td>\n",
|
| 114 |
+
" <td>0.0</td>\n",
|
| 115 |
+
" <td>1208.645462</td>\n",
|
| 116 |
+
" <td>1.0</td>\n",
|
| 117 |
+
" <td>125.24</td>\n",
|
| 118 |
+
" <td>16.0</td>\n",
|
| 119 |
+
" <td>1.0</td>\n",
|
| 120 |
+
" <td>0.0</td>\n",
|
| 121 |
+
" </tr>\n",
|
| 122 |
+
" <tr>\n",
|
| 123 |
+
" <th>2</th>\n",
|
| 124 |
+
" <td>13</td>\n",
|
| 125 |
+
" <td>Goserelin</td>\n",
|
| 126 |
+
" <td>solid</td>\n",
|
| 127 |
+
" <td>L02AE</td>\n",
|
| 128 |
+
" <td>L02A</td>\n",
|
| 129 |
+
" <td>L02</td>\n",
|
| 130 |
+
" <td>L</td>\n",
|
| 131 |
+
" <td>1269.4105</td>\n",
|
| 132 |
+
" <td>-5.10</td>\n",
|
| 133 |
+
" <td>0.02830</td>\n",
|
| 134 |
+
" <td>...</td>\n",
|
| 135 |
+
" <td>33.0</td>\n",
|
| 136 |
+
" <td>495.89</td>\n",
|
| 137 |
+
" <td>10.91</td>\n",
|
| 138 |
+
" <td>0.0</td>\n",
|
| 139 |
+
" <td>1268.641439</td>\n",
|
| 140 |
+
" <td>1.0</td>\n",
|
| 141 |
+
" <td>130.74</td>\n",
|
| 142 |
+
" <td>18.0</td>\n",
|
| 143 |
+
" <td>1.0</td>\n",
|
| 144 |
+
" <td>0.0</td>\n",
|
| 145 |
+
" </tr>\n",
|
| 146 |
+
" <tr>\n",
|
| 147 |
+
" <th>3</th>\n",
|
| 148 |
+
" <td>25</td>\n",
|
| 149 |
+
" <td>Gramicidin D</td>\n",
|
| 150 |
+
" <td>liquid</td>\n",
|
| 151 |
+
" <td>R02AB</td>\n",
|
| 152 |
+
" <td>R02A</td>\n",
|
| 153 |
+
" <td>R02</td>\n",
|
| 154 |
+
" <td>R</td>\n",
|
| 155 |
+
" <td>1811.2530</td>\n",
|
| 156 |
+
" <td>5.96</td>\n",
|
| 157 |
+
" <td>0.00390</td>\n",
|
| 158 |
+
" <td>...</td>\n",
|
| 159 |
+
" <td>50.0</td>\n",
|
| 160 |
+
" <td>519.89</td>\n",
|
| 161 |
+
" <td>NaN</td>\n",
|
| 162 |
+
" <td>0.0</td>\n",
|
| 163 |
+
" <td>1810.033419</td>\n",
|
| 164 |
+
" <td>1.0</td>\n",
|
| 165 |
+
" <td>194.73</td>\n",
|
| 166 |
+
" <td>16.0</td>\n",
|
| 167 |
+
" <td>0.0</td>\n",
|
| 168 |
+
" <td>0.0</td>\n",
|
| 169 |
+
" </tr>\n",
|
| 170 |
+
" <tr>\n",
|
| 171 |
+
" <th>4</th>\n",
|
| 172 |
+
" <td>33</td>\n",
|
| 173 |
+
" <td>Desmopressin</td>\n",
|
| 174 |
+
" <td>solid</td>\n",
|
| 175 |
+
" <td>H01BA</td>\n",
|
| 176 |
+
" <td>H01B</td>\n",
|
| 177 |
+
" <td>H01</td>\n",
|
| 178 |
+
" <td>H</td>\n",
|
| 179 |
+
" <td>1069.2200</td>\n",
|
| 180 |
+
" <td>-6.10</td>\n",
|
| 181 |
+
" <td>0.11000</td>\n",
|
| 182 |
+
" <td>...</td>\n",
|
| 183 |
+
" <td>19.0</td>\n",
|
| 184 |
+
" <td>435.41</td>\n",
|
| 185 |
+
" <td>11.77</td>\n",
|
| 186 |
+
" <td>0.0</td>\n",
|
| 187 |
+
" <td>1068.426956</td>\n",
|
| 188 |
+
" <td>1.0</td>\n",
|
| 189 |
+
" <td>104.78</td>\n",
|
| 190 |
+
" <td>15.0</td>\n",
|
| 191 |
+
" <td>1.0</td>\n",
|
| 192 |
+
" <td>0.0</td>\n",
|
| 193 |
+
" </tr>\n",
|
| 194 |
+
" <tr>\n",
|
| 195 |
+
" <th>...</th>\n",
|
| 196 |
+
" <td>...</td>\n",
|
| 197 |
+
" <td>...</td>\n",
|
| 198 |
+
" <td>...</td>\n",
|
| 199 |
+
" <td>...</td>\n",
|
| 200 |
+
" <td>...</td>\n",
|
| 201 |
+
" <td>...</td>\n",
|
| 202 |
+
" <td>...</td>\n",
|
| 203 |
+
" <td>...</td>\n",
|
| 204 |
+
" <td>...</td>\n",
|
| 205 |
+
" <td>...</td>\n",
|
| 206 |
+
" <td>...</td>\n",
|
| 207 |
+
" <td>...</td>\n",
|
| 208 |
+
" <td>...</td>\n",
|
| 209 |
+
" <td>...</td>\n",
|
| 210 |
+
" <td>...</td>\n",
|
| 211 |
+
" <td>...</td>\n",
|
| 212 |
+
" <td>...</td>\n",
|
| 213 |
+
" <td>...</td>\n",
|
| 214 |
+
" <td>...</td>\n",
|
| 215 |
+
" <td>...</td>\n",
|
| 216 |
+
" <td>...</td>\n",
|
| 217 |
+
" </tr>\n",
|
| 218 |
+
" <tr>\n",
|
| 219 |
+
" <th>2625</th>\n",
|
| 220 |
+
" <td>14553</td>\n",
|
| 221 |
+
" <td>Belumosudil</td>\n",
|
| 222 |
+
" <td>solid</td>\n",
|
| 223 |
+
" <td>L04AA</td>\n",
|
| 224 |
+
" <td>L04A</td>\n",
|
| 225 |
+
" <td>L04</td>\n",
|
| 226 |
+
" <td>L</td>\n",
|
| 227 |
+
" <td>452.5180</td>\n",
|
| 228 |
+
" <td>4.65</td>\n",
|
| 229 |
+
" <td>0.00289</td>\n",
|
| 230 |
+
" <td>...</td>\n",
|
| 231 |
+
" <td>7.0</td>\n",
|
| 232 |
+
" <td>104.82</td>\n",
|
| 233 |
+
" <td>4.11</td>\n",
|
| 234 |
+
" <td>0.0</td>\n",
|
| 235 |
+
" <td>452.196074</td>\n",
|
| 236 |
+
" <td>1.0</td>\n",
|
| 237 |
+
" <td>49.55</td>\n",
|
| 238 |
+
" <td>6.0</td>\n",
|
| 239 |
+
" <td>0.0</td>\n",
|
| 240 |
+
" <td>1.0</td>\n",
|
| 241 |
+
" </tr>\n",
|
| 242 |
+
" <tr>\n",
|
| 243 |
+
" <th>2626</th>\n",
|
| 244 |
+
" <td>14688</td>\n",
|
| 245 |
+
" <td>Tebipenem pivoxil</td>\n",
|
| 246 |
+
" <td>NaN</td>\n",
|
| 247 |
+
" <td>J01DH</td>\n",
|
| 248 |
+
" <td>J01D</td>\n",
|
| 249 |
+
" <td>J01</td>\n",
|
| 250 |
+
" <td>J</td>\n",
|
| 251 |
+
" <td>497.6300</td>\n",
|
| 252 |
+
" <td>1.59</td>\n",
|
| 253 |
+
" <td>0.16700</td>\n",
|
| 254 |
+
" <td>...</td>\n",
|
| 255 |
+
" <td>9.0</td>\n",
|
| 256 |
+
" <td>108.74</td>\n",
|
| 257 |
+
" <td>6.27</td>\n",
|
| 258 |
+
" <td>0.0</td>\n",
|
| 259 |
+
" <td>497.165428</td>\n",
|
| 260 |
+
" <td>1.0</td>\n",
|
| 261 |
+
" <td>53.39</td>\n",
|
| 262 |
+
" <td>6.0</td>\n",
|
| 263 |
+
" <td>0.0</td>\n",
|
| 264 |
+
" <td>1.0</td>\n",
|
| 265 |
+
" </tr>\n",
|
| 266 |
+
" <tr>\n",
|
| 267 |
+
" <th>2627</th>\n",
|
| 268 |
+
" <td>14698</td>\n",
|
| 269 |
+
" <td>Tosufloxacin</td>\n",
|
| 270 |
+
" <td>NaN</td>\n",
|
| 271 |
+
" <td>J01MA</td>\n",
|
| 272 |
+
" <td>J01M</td>\n",
|
| 273 |
+
" <td>J01</td>\n",
|
| 274 |
+
" <td>J</td>\n",
|
| 275 |
+
" <td>404.3490</td>\n",
|
| 276 |
+
" <td>0.47</td>\n",
|
| 277 |
+
" <td>0.07620</td>\n",
|
| 278 |
+
" <td>...</td>\n",
|
| 279 |
+
" <td>3.0</td>\n",
|
| 280 |
+
" <td>99.76</td>\n",
|
| 281 |
+
" <td>9.80</td>\n",
|
| 282 |
+
" <td>1.0</td>\n",
|
| 283 |
+
" <td>404.109625</td>\n",
|
| 284 |
+
" <td>0.0</td>\n",
|
| 285 |
+
" <td>37.18</td>\n",
|
| 286 |
+
" <td>7.0</td>\n",
|
| 287 |
+
" <td>0.0</td>\n",
|
| 288 |
+
" <td>1.0</td>\n",
|
| 289 |
+
" </tr>\n",
|
| 290 |
+
" <tr>\n",
|
| 291 |
+
" <th>2628</th>\n",
|
| 292 |
+
" <td>14931</td>\n",
|
| 293 |
+
" <td>Linzagolix</td>\n",
|
| 294 |
+
" <td>solid</td>\n",
|
| 295 |
+
" <td>H01CC</td>\n",
|
| 296 |
+
" <td>H01C</td>\n",
|
| 297 |
+
" <td>H01</td>\n",
|
| 298 |
+
" <td>H</td>\n",
|
| 299 |
+
" <td>508.4200</td>\n",
|
| 300 |
+
" <td>3.88</td>\n",
|
| 301 |
+
" <td>0.00198</td>\n",
|
| 302 |
+
" <td>...</td>\n",
|
| 303 |
+
" <td>6.0</td>\n",
|
| 304 |
+
" <td>114.40</td>\n",
|
| 305 |
+
" <td>-3.50</td>\n",
|
| 306 |
+
" <td>0.0</td>\n",
|
| 307 |
+
" <td>508.055206</td>\n",
|
| 308 |
+
" <td>1.0</td>\n",
|
| 309 |
+
" <td>45.39</td>\n",
|
| 310 |
+
" <td>7.0</td>\n",
|
| 311 |
+
" <td>-1.0</td>\n",
|
| 312 |
+
" <td>0.0</td>\n",
|
| 313 |
+
" </tr>\n",
|
| 314 |
+
" <tr>\n",
|
| 315 |
+
" <th>2629</th>\n",
|
| 316 |
+
" <td>14995</td>\n",
|
| 317 |
+
" <td>Methionine C-11</td>\n",
|
| 318 |
+
" <td>NaN</td>\n",
|
| 319 |
+
" <td>V09IX</td>\n",
|
| 320 |
+
" <td>V09I</td>\n",
|
| 321 |
+
" <td>V09</td>\n",
|
| 322 |
+
" <td>V</td>\n",
|
| 323 |
+
" <td>148.2100</td>\n",
|
| 324 |
+
" <td>-2.20</td>\n",
|
| 325 |
+
" <td>23.90000</td>\n",
|
| 326 |
+
" <td>...</td>\n",
|
| 327 |
+
" <td>4.0</td>\n",
|
| 328 |
+
" <td>63.32</td>\n",
|
| 329 |
+
" <td>9.50</td>\n",
|
| 330 |
+
" <td>0.0</td>\n",
|
| 331 |
+
" <td>148.062484</td>\n",
|
| 332 |
+
" <td>0.0</td>\n",
|
| 333 |
+
" <td>15.54</td>\n",
|
| 334 |
+
" <td>3.0</td>\n",
|
| 335 |
+
" <td>0.0</td>\n",
|
| 336 |
+
" <td>1.0</td>\n",
|
| 337 |
+
" </tr>\n",
|
| 338 |
+
" </tbody>\n",
|
| 339 |
+
"</table>\n",
|
| 340 |
+
"<p>2630 rows × 26 columns</p>\n",
|
| 341 |
+
"</div>"
|
| 342 |
+
],
|
| 343 |
+
"text/plain": [
|
| 344 |
+
" Unnamed: 0 name state level4 level3 level2 level1 \\\n",
|
| 345 |
+
"0 5 Bivalirudin solid B01AE B01A B01 B \n",
|
| 346 |
+
"1 6 Leuprolide solid L02AE L02A L02 L \n",
|
| 347 |
+
"2 13 Goserelin solid L02AE L02A L02 L \n",
|
| 348 |
+
"3 25 Gramicidin D liquid R02AB R02A R02 R \n",
|
| 349 |
+
"4 33 Desmopressin solid H01BA H01B H01 H \n",
|
| 350 |
+
"... ... ... ... ... ... ... ... \n",
|
| 351 |
+
"2625 14553 Belumosudil solid L04AA L04A L04 L \n",
|
| 352 |
+
"2626 14688 Tebipenem pivoxil NaN J01DH J01D J01 J \n",
|
| 353 |
+
"2627 14698 Tosufloxacin NaN J01MA J01M J01 J \n",
|
| 354 |
+
"2628 14931 Linzagolix solid H01CC H01C H01 H \n",
|
| 355 |
+
"2629 14995 Methionine C-11 NaN V09IX V09I V09 V \n",
|
| 356 |
+
"\n",
|
| 357 |
+
" Molecular Weight logP Water Solubility ... Rotatable Bond Count \\\n",
|
| 358 |
+
"0 2180.2853 -14.00 0.04640 ... 66.0 \n",
|
| 359 |
+
"1 1209.3983 -2.40 0.03380 ... 32.0 \n",
|
| 360 |
+
"2 1269.4105 -5.10 0.02830 ... 33.0 \n",
|
| 361 |
+
"3 1811.2530 5.96 0.00390 ... 50.0 \n",
|
| 362 |
+
"4 1069.2200 -6.10 0.11000 ... 19.0 \n",
|
| 363 |
+
"... ... ... ... ... ... \n",
|
| 364 |
+
"2625 452.5180 4.65 0.00289 ... 7.0 \n",
|
| 365 |
+
"2626 497.6300 1.59 0.16700 ... 9.0 \n",
|
| 366 |
+
"2627 404.3490 0.47 0.07620 ... 3.0 \n",
|
| 367 |
+
"2628 508.4200 3.88 0.00198 ... 6.0 \n",
|
| 368 |
+
"2629 148.2100 -2.20 23.90000 ... 4.0 \n",
|
| 369 |
+
"\n",
|
| 370 |
+
" Polar Surface Area (PSA) pKa (strongest basic) Ghose Filter \\\n",
|
| 371 |
+
"0 901.57 11.88 0.0 \n",
|
| 372 |
+
"1 429.04 11.92 0.0 \n",
|
| 373 |
+
"2 495.89 10.91 0.0 \n",
|
| 374 |
+
"3 519.89 NaN 0.0 \n",
|
| 375 |
+
"4 435.41 11.77 0.0 \n",
|
| 376 |
+
"... ... ... ... \n",
|
| 377 |
+
"2625 104.82 4.11 0.0 \n",
|
| 378 |
+
"2626 108.74 6.27 0.0 \n",
|
| 379 |
+
"2627 99.76 9.80 1.0 \n",
|
| 380 |
+
"2628 114.40 -3.50 0.0 \n",
|
| 381 |
+
"2629 63.32 9.50 0.0 \n",
|
| 382 |
+
"\n",
|
| 383 |
+
" Monoisotopic Weight MDDR-Like Rule Polarizability \\\n",
|
| 384 |
+
"0 2178.985813 1.0 218.54 \n",
|
| 385 |
+
"1 1208.645462 1.0 125.24 \n",
|
| 386 |
+
"2 1268.641439 1.0 130.74 \n",
|
| 387 |
+
"3 1810.033419 1.0 194.73 \n",
|
| 388 |
+
"4 1068.426956 1.0 104.78 \n",
|
| 389 |
+
"... ... ... ... \n",
|
| 390 |
+
"2625 452.196074 1.0 49.55 \n",
|
| 391 |
+
"2626 497.165428 1.0 53.39 \n",
|
| 392 |
+
"2627 404.109625 0.0 37.18 \n",
|
| 393 |
+
"2628 508.055206 1.0 45.39 \n",
|
| 394 |
+
"2629 148.062484 0.0 15.54 \n",
|
| 395 |
+
"\n",
|
| 396 |
+
" H Bond Acceptor Count Physiological Charge Rule of Five \n",
|
| 397 |
+
"0 37.0 -4.0 0.0 \n",
|
| 398 |
+
"1 16.0 1.0 0.0 \n",
|
| 399 |
+
"2 18.0 1.0 0.0 \n",
|
| 400 |
+
"3 16.0 0.0 0.0 \n",
|
| 401 |
+
"4 15.0 1.0 0.0 \n",
|
| 402 |
+
"... ... ... ... \n",
|
| 403 |
+
"2625 6.0 0.0 1.0 \n",
|
| 404 |
+
"2626 6.0 0.0 1.0 \n",
|
| 405 |
+
"2627 7.0 0.0 1.0 \n",
|
| 406 |
+
"2628 7.0 -1.0 0.0 \n",
|
| 407 |
+
"2629 3.0 0.0 1.0 \n",
|
| 408 |
+
"\n",
|
| 409 |
+
"[2630 rows x 26 columns]"
|
| 410 |
+
]
|
| 411 |
+
},
|
| 412 |
+
"execution_count": 3,
|
| 413 |
+
"metadata": {},
|
| 414 |
+
"output_type": "execute_result"
|
| 415 |
+
}
|
| 416 |
+
],
|
| 417 |
+
"source": [
|
| 418 |
+
"df_drugs = pd.read_csv('data/filtered_dataset.csv')\n",
|
| 419 |
+
"df_drugs"
|
| 420 |
+
]
|
| 421 |
+
},
|
| 422 |
+
{
|
| 423 |
+
"cell_type": "code",
|
| 424 |
+
"execution_count": 4,
|
| 425 |
+
"metadata": {},
|
| 426 |
+
"outputs": [],
|
| 427 |
+
"source": [
|
| 428 |
+
"def adjacency_matrix(df):\n",
|
| 429 |
+
" # create a matrix of zeros with the same shape as the final adjacency matrix\n",
|
| 430 |
+
" matrix = np.zeros((len(df), len(df)), dtype=int)\n",
|
| 431 |
+
"\n",
|
| 432 |
+
" # loop through each drug and set the corresponding values in the matrix to 1\n",
|
| 433 |
+
" for i, drug in enumerate(df['name']):\n",
|
| 434 |
+
" interacting_drugs = ddi_json[drug]\n",
|
| 435 |
+
" indices = df.index[df['name'].isin(interacting_drugs)].tolist()\n",
|
| 436 |
+
" matrix[i, indices] = 1\n",
|
| 437 |
+
"\n",
|
| 438 |
+
" # convert the matrix to a dataframe and set the column names and index\n",
|
| 439 |
+
" df_matrix = pd.DataFrame(matrix, columns=df['name'], index=df['name'])\n",
|
| 440 |
+
"\n",
|
| 441 |
+
" return df_matrix\n",
|
| 442 |
+
"\n",
|
| 443 |
+
"df_matrix = adjacency_matrix(df_drugs)"
|
| 444 |
+
]
|
| 445 |
+
},
|
| 446 |
+
{
|
| 447 |
+
"cell_type": "code",
|
| 448 |
+
"execution_count": 5,
|
| 449 |
+
"metadata": {},
|
| 450 |
+
"outputs": [
|
| 451 |
+
{
|
| 452 |
+
"data": {
|
| 453 |
+
"text/html": [
|
| 454 |
+
"<div>\n",
|
| 455 |
+
"<style scoped>\n",
|
| 456 |
+
" .dataframe tbody tr th:only-of-type {\n",
|
| 457 |
+
" vertical-align: middle;\n",
|
| 458 |
+
" }\n",
|
| 459 |
+
"\n",
|
| 460 |
+
" .dataframe tbody tr th {\n",
|
| 461 |
+
" vertical-align: top;\n",
|
| 462 |
+
" }\n",
|
| 463 |
+
"\n",
|
| 464 |
+
" .dataframe thead th {\n",
|
| 465 |
+
" text-align: right;\n",
|
| 466 |
+
" }\n",
|
| 467 |
+
"</style>\n",
|
| 468 |
+
"<table border=\"1\" class=\"dataframe\">\n",
|
| 469 |
+
" <thead>\n",
|
| 470 |
+
" <tr style=\"text-align: right;\">\n",
|
| 471 |
+
" <th>name</th>\n",
|
| 472 |
+
" <th>Bivalirudin</th>\n",
|
| 473 |
+
" <th>Leuprolide</th>\n",
|
| 474 |
+
" <th>Goserelin</th>\n",
|
| 475 |
+
" <th>Gramicidin D</th>\n",
|
| 476 |
+
" <th>Desmopressin</th>\n",
|
| 477 |
+
" <th>Cetrorelix</th>\n",
|
| 478 |
+
" <th>Daptomycin</th>\n",
|
| 479 |
+
" <th>Abarelix</th>\n",
|
| 480 |
+
" <th>Pyridoxal phosphate</th>\n",
|
| 481 |
+
" <th>Cyanocobalamin</th>\n",
|
| 482 |
+
" <th>...</th>\n",
|
| 483 |
+
" <th>Naphthoquine</th>\n",
|
| 484 |
+
" <th>Odevixibat</th>\n",
|
| 485 |
+
" <th>Melphalan flufenamide</th>\n",
|
| 486 |
+
" <th>Deucravacitinib</th>\n",
|
| 487 |
+
" <th>Tegoprazan</th>\n",
|
| 488 |
+
" <th>Belumosudil</th>\n",
|
| 489 |
+
" <th>Tebipenem pivoxil</th>\n",
|
| 490 |
+
" <th>Tosufloxacin</th>\n",
|
| 491 |
+
" <th>Linzagolix</th>\n",
|
| 492 |
+
" <th>Methionine C-11</th>\n",
|
| 493 |
+
" </tr>\n",
|
| 494 |
+
" <tr>\n",
|
| 495 |
+
" <th>name</th>\n",
|
| 496 |
+
" <th></th>\n",
|
| 497 |
+
" <th></th>\n",
|
| 498 |
+
" <th></th>\n",
|
| 499 |
+
" <th></th>\n",
|
| 500 |
+
" <th></th>\n",
|
| 501 |
+
" <th></th>\n",
|
| 502 |
+
" <th></th>\n",
|
| 503 |
+
" <th></th>\n",
|
| 504 |
+
" <th></th>\n",
|
| 505 |
+
" <th></th>\n",
|
| 506 |
+
" <th></th>\n",
|
| 507 |
+
" <th></th>\n",
|
| 508 |
+
" <th></th>\n",
|
| 509 |
+
" <th></th>\n",
|
| 510 |
+
" <th></th>\n",
|
| 511 |
+
" <th></th>\n",
|
| 512 |
+
" <th></th>\n",
|
| 513 |
+
" <th></th>\n",
|
| 514 |
+
" <th></th>\n",
|
| 515 |
+
" <th></th>\n",
|
| 516 |
+
" <th></th>\n",
|
| 517 |
+
" </tr>\n",
|
| 518 |
+
" </thead>\n",
|
| 519 |
+
" <tbody>\n",
|
| 520 |
+
" <tr>\n",
|
| 521 |
+
" <th>Bivalirudin</th>\n",
|
| 522 |
+
" <td>0</td>\n",
|
| 523 |
+
" <td>0</td>\n",
|
| 524 |
+
" <td>0</td>\n",
|
| 525 |
+
" <td>0</td>\n",
|
| 526 |
+
" <td>0</td>\n",
|
| 527 |
+
" <td>0</td>\n",
|
| 528 |
+
" <td>0</td>\n",
|
| 529 |
+
" <td>0</td>\n",
|
| 530 |
+
" <td>0</td>\n",
|
| 531 |
+
" <td>0</td>\n",
|
| 532 |
+
" <td>...</td>\n",
|
| 533 |
+
" <td>0</td>\n",
|
| 534 |
+
" <td>0</td>\n",
|
| 535 |
+
" <td>0</td>\n",
|
| 536 |
+
" <td>0</td>\n",
|
| 537 |
+
" <td>0</td>\n",
|
| 538 |
+
" <td>0</td>\n",
|
| 539 |
+
" <td>0</td>\n",
|
| 540 |
+
" <td>0</td>\n",
|
| 541 |
+
" <td>0</td>\n",
|
| 542 |
+
" <td>0</td>\n",
|
| 543 |
+
" </tr>\n",
|
| 544 |
+
" <tr>\n",
|
| 545 |
+
" <th>Leuprolide</th>\n",
|
| 546 |
+
" <td>0</td>\n",
|
| 547 |
+
" <td>0</td>\n",
|
| 548 |
+
" <td>1</td>\n",
|
| 549 |
+
" <td>0</td>\n",
|
| 550 |
+
" <td>1</td>\n",
|
| 551 |
+
" <td>0</td>\n",
|
| 552 |
+
" <td>1</td>\n",
|
| 553 |
+
" <td>0</td>\n",
|
| 554 |
+
" <td>0</td>\n",
|
| 555 |
+
" <td>1</td>\n",
|
| 556 |
+
" <td>...</td>\n",
|
| 557 |
+
" <td>0</td>\n",
|
| 558 |
+
" <td>0</td>\n",
|
| 559 |
+
" <td>0</td>\n",
|
| 560 |
+
" <td>0</td>\n",
|
| 561 |
+
" <td>0</td>\n",
|
| 562 |
+
" <td>0</td>\n",
|
| 563 |
+
" <td>0</td>\n",
|
| 564 |
+
" <td>0</td>\n",
|
| 565 |
+
" <td>0</td>\n",
|
| 566 |
+
" <td>0</td>\n",
|
| 567 |
+
" </tr>\n",
|
| 568 |
+
" <tr>\n",
|
| 569 |
+
" <th>Goserelin</th>\n",
|
| 570 |
+
" <td>0</td>\n",
|
| 571 |
+
" <td>1</td>\n",
|
| 572 |
+
" <td>0</td>\n",
|
| 573 |
+
" <td>0</td>\n",
|
| 574 |
+
" <td>1</td>\n",
|
| 575 |
+
" <td>0</td>\n",
|
| 576 |
+
" <td>1</td>\n",
|
| 577 |
+
" <td>0</td>\n",
|
| 578 |
+
" <td>0</td>\n",
|
| 579 |
+
" <td>1</td>\n",
|
| 580 |
+
" <td>...</td>\n",
|
| 581 |
+
" <td>0</td>\n",
|
| 582 |
+
" <td>0</td>\n",
|
| 583 |
+
" <td>0</td>\n",
|
| 584 |
+
" <td>0</td>\n",
|
| 585 |
+
" <td>0</td>\n",
|
| 586 |
+
" <td>0</td>\n",
|
| 587 |
+
" <td>0</td>\n",
|
| 588 |
+
" <td>0</td>\n",
|
| 589 |
+
" <td>0</td>\n",
|
| 590 |
+
" <td>0</td>\n",
|
| 591 |
+
" </tr>\n",
|
| 592 |
+
" <tr>\n",
|
| 593 |
+
" <th>Gramicidin D</th>\n",
|
| 594 |
+
" <td>0</td>\n",
|
| 595 |
+
" <td>0</td>\n",
|
| 596 |
+
" <td>0</td>\n",
|
| 597 |
+
" <td>0</td>\n",
|
| 598 |
+
" <td>0</td>\n",
|
| 599 |
+
" <td>0</td>\n",
|
| 600 |
+
" <td>0</td>\n",
|
| 601 |
+
" <td>0</td>\n",
|
| 602 |
+
" <td>0</td>\n",
|
| 603 |
+
" <td>0</td>\n",
|
| 604 |
+
" <td>...</td>\n",
|
| 605 |
+
" <td>0</td>\n",
|
| 606 |
+
" <td>0</td>\n",
|
| 607 |
+
" <td>0</td>\n",
|
| 608 |
+
" <td>0</td>\n",
|
| 609 |
+
" <td>0</td>\n",
|
| 610 |
+
" <td>0</td>\n",
|
| 611 |
+
" <td>0</td>\n",
|
| 612 |
+
" <td>0</td>\n",
|
| 613 |
+
" <td>0</td>\n",
|
| 614 |
+
" <td>0</td>\n",
|
| 615 |
+
" </tr>\n",
|
| 616 |
+
" <tr>\n",
|
| 617 |
+
" <th>Desmopressin</th>\n",
|
| 618 |
+
" <td>0</td>\n",
|
| 619 |
+
" <td>1</td>\n",
|
| 620 |
+
" <td>1</td>\n",
|
| 621 |
+
" <td>0</td>\n",
|
| 622 |
+
" <td>0</td>\n",
|
| 623 |
+
" <td>0</td>\n",
|
| 624 |
+
" <td>1</td>\n",
|
| 625 |
+
" <td>0</td>\n",
|
| 626 |
+
" <td>0</td>\n",
|
| 627 |
+
" <td>1</td>\n",
|
| 628 |
+
" <td>...</td>\n",
|
| 629 |
+
" <td>0</td>\n",
|
| 630 |
+
" <td>0</td>\n",
|
| 631 |
+
" <td>0</td>\n",
|
| 632 |
+
" <td>0</td>\n",
|
| 633 |
+
" <td>0</td>\n",
|
| 634 |
+
" <td>0</td>\n",
|
| 635 |
+
" <td>0</td>\n",
|
| 636 |
+
" <td>0</td>\n",
|
| 637 |
+
" <td>0</td>\n",
|
| 638 |
+
" <td>0</td>\n",
|
| 639 |
+
" </tr>\n",
|
| 640 |
+
" <tr>\n",
|
| 641 |
+
" <th>...</th>\n",
|
| 642 |
+
" <td>...</td>\n",
|
| 643 |
+
" <td>...</td>\n",
|
| 644 |
+
" <td>...</td>\n",
|
| 645 |
+
" <td>...</td>\n",
|
| 646 |
+
" <td>...</td>\n",
|
| 647 |
+
" <td>...</td>\n",
|
| 648 |
+
" <td>...</td>\n",
|
| 649 |
+
" <td>...</td>\n",
|
| 650 |
+
" <td>...</td>\n",
|
| 651 |
+
" <td>...</td>\n",
|
| 652 |
+
" <td>...</td>\n",
|
| 653 |
+
" <td>...</td>\n",
|
| 654 |
+
" <td>...</td>\n",
|
| 655 |
+
" <td>...</td>\n",
|
| 656 |
+
" <td>...</td>\n",
|
| 657 |
+
" <td>...</td>\n",
|
| 658 |
+
" <td>...</td>\n",
|
| 659 |
+
" <td>...</td>\n",
|
| 660 |
+
" <td>...</td>\n",
|
| 661 |
+
" <td>...</td>\n",
|
| 662 |
+
" <td>...</td>\n",
|
| 663 |
+
" </tr>\n",
|
| 664 |
+
" <tr>\n",
|
| 665 |
+
" <th>Belumosudil</th>\n",
|
| 666 |
+
" <td>0</td>\n",
|
| 667 |
+
" <td>0</td>\n",
|
| 668 |
+
" <td>0</td>\n",
|
| 669 |
+
" <td>0</td>\n",
|
| 670 |
+
" <td>0</td>\n",
|
| 671 |
+
" <td>0</td>\n",
|
| 672 |
+
" <td>1</td>\n",
|
| 673 |
+
" <td>0</td>\n",
|
| 674 |
+
" <td>0</td>\n",
|
| 675 |
+
" <td>0</td>\n",
|
| 676 |
+
" <td>...</td>\n",
|
| 677 |
+
" <td>0</td>\n",
|
| 678 |
+
" <td>0</td>\n",
|
| 679 |
+
" <td>0</td>\n",
|
| 680 |
+
" <td>1</td>\n",
|
| 681 |
+
" <td>0</td>\n",
|
| 682 |
+
" <td>0</td>\n",
|
| 683 |
+
" <td>0</td>\n",
|
| 684 |
+
" <td>0</td>\n",
|
| 685 |
+
" <td>0</td>\n",
|
| 686 |
+
" <td>0</td>\n",
|
| 687 |
+
" </tr>\n",
|
| 688 |
+
" <tr>\n",
|
| 689 |
+
" <th>Tebipenem pivoxil</th>\n",
|
| 690 |
+
" <td>0</td>\n",
|
| 691 |
+
" <td>0</td>\n",
|
| 692 |
+
" <td>0</td>\n",
|
| 693 |
+
" <td>0</td>\n",
|
| 694 |
+
" <td>0</td>\n",
|
| 695 |
+
" <td>0</td>\n",
|
| 696 |
+
" <td>0</td>\n",
|
| 697 |
+
" <td>0</td>\n",
|
| 698 |
+
" <td>0</td>\n",
|
| 699 |
+
" <td>0</td>\n",
|
| 700 |
+
" <td>...</td>\n",
|
| 701 |
+
" <td>0</td>\n",
|
| 702 |
+
" <td>0</td>\n",
|
| 703 |
+
" <td>0</td>\n",
|
| 704 |
+
" <td>0</td>\n",
|
| 705 |
+
" <td>0</td>\n",
|
| 706 |
+
" <td>0</td>\n",
|
| 707 |
+
" <td>0</td>\n",
|
| 708 |
+
" <td>0</td>\n",
|
| 709 |
+
" <td>0</td>\n",
|
| 710 |
+
" <td>0</td>\n",
|
| 711 |
+
" </tr>\n",
|
| 712 |
+
" <tr>\n",
|
| 713 |
+
" <th>Tosufloxacin</th>\n",
|
| 714 |
+
" <td>0</td>\n",
|
| 715 |
+
" <td>0</td>\n",
|
| 716 |
+
" <td>0</td>\n",
|
| 717 |
+
" <td>0</td>\n",
|
| 718 |
+
" <td>0</td>\n",
|
| 719 |
+
" <td>0</td>\n",
|
| 720 |
+
" <td>0</td>\n",
|
| 721 |
+
" <td>0</td>\n",
|
| 722 |
+
" <td>0</td>\n",
|
| 723 |
+
" <td>0</td>\n",
|
| 724 |
+
" <td>...</td>\n",
|
| 725 |
+
" <td>0</td>\n",
|
| 726 |
+
" <td>0</td>\n",
|
| 727 |
+
" <td>0</td>\n",
|
| 728 |
+
" <td>0</td>\n",
|
| 729 |
+
" <td>0</td>\n",
|
| 730 |
+
" <td>0</td>\n",
|
| 731 |
+
" <td>0</td>\n",
|
| 732 |
+
" <td>0</td>\n",
|
| 733 |
+
" <td>0</td>\n",
|
| 734 |
+
" <td>0</td>\n",
|
| 735 |
+
" </tr>\n",
|
| 736 |
+
" <tr>\n",
|
| 737 |
+
" <th>Linzagolix</th>\n",
|
| 738 |
+
" <td>0</td>\n",
|
| 739 |
+
" <td>0</td>\n",
|
| 740 |
+
" <td>0</td>\n",
|
| 741 |
+
" <td>0</td>\n",
|
| 742 |
+
" <td>0</td>\n",
|
| 743 |
+
" <td>0</td>\n",
|
| 744 |
+
" <td>0</td>\n",
|
| 745 |
+
" <td>0</td>\n",
|
| 746 |
+
" <td>0</td>\n",
|
| 747 |
+
" <td>0</td>\n",
|
| 748 |
+
" <td>...</td>\n",
|
| 749 |
+
" <td>0</td>\n",
|
| 750 |
+
" <td>0</td>\n",
|
| 751 |
+
" <td>0</td>\n",
|
| 752 |
+
" <td>0</td>\n",
|
| 753 |
+
" <td>0</td>\n",
|
| 754 |
+
" <td>0</td>\n",
|
| 755 |
+
" <td>0</td>\n",
|
| 756 |
+
" <td>0</td>\n",
|
| 757 |
+
" <td>0</td>\n",
|
| 758 |
+
" <td>0</td>\n",
|
| 759 |
+
" </tr>\n",
|
| 760 |
+
" <tr>\n",
|
| 761 |
+
" <th>Methionine C-11</th>\n",
|
| 762 |
+
" <td>0</td>\n",
|
| 763 |
+
" <td>0</td>\n",
|
| 764 |
+
" <td>0</td>\n",
|
| 765 |
+
" <td>0</td>\n",
|
| 766 |
+
" <td>0</td>\n",
|
| 767 |
+
" <td>0</td>\n",
|
| 768 |
+
" <td>0</td>\n",
|
| 769 |
+
" <td>0</td>\n",
|
| 770 |
+
" <td>0</td>\n",
|
| 771 |
+
" <td>0</td>\n",
|
| 772 |
+
" <td>...</td>\n",
|
| 773 |
+
" <td>0</td>\n",
|
| 774 |
+
" <td>0</td>\n",
|
| 775 |
+
" <td>0</td>\n",
|
| 776 |
+
" <td>0</td>\n",
|
| 777 |
+
" <td>0</td>\n",
|
| 778 |
+
" <td>0</td>\n",
|
| 779 |
+
" <td>0</td>\n",
|
| 780 |
+
" <td>0</td>\n",
|
| 781 |
+
" <td>0</td>\n",
|
| 782 |
+
" <td>0</td>\n",
|
| 783 |
+
" </tr>\n",
|
| 784 |
+
" </tbody>\n",
|
| 785 |
+
"</table>\n",
|
| 786 |
+
"<p>2630 rows × 2630 columns</p>\n",
|
| 787 |
+
"</div>"
|
| 788 |
+
],
|
| 789 |
+
"text/plain": [
|
| 790 |
+
"name Bivalirudin Leuprolide Goserelin Gramicidin D \\\n",
|
| 791 |
+
"name \n",
|
| 792 |
+
"Bivalirudin 0 0 0 0 \n",
|
| 793 |
+
"Leuprolide 0 0 1 0 \n",
|
| 794 |
+
"Goserelin 0 1 0 0 \n",
|
| 795 |
+
"Gramicidin D 0 0 0 0 \n",
|
| 796 |
+
"Desmopressin 0 1 1 0 \n",
|
| 797 |
+
"... ... ... ... ... \n",
|
| 798 |
+
"Belumosudil 0 0 0 0 \n",
|
| 799 |
+
"Tebipenem pivoxil 0 0 0 0 \n",
|
| 800 |
+
"Tosufloxacin 0 0 0 0 \n",
|
| 801 |
+
"Linzagolix 0 0 0 0 \n",
|
| 802 |
+
"Methionine C-11 0 0 0 0 \n",
|
| 803 |
+
"\n",
|
| 804 |
+
"name Desmopressin Cetrorelix Daptomycin Abarelix \\\n",
|
| 805 |
+
"name \n",
|
| 806 |
+
"Bivalirudin 0 0 0 0 \n",
|
| 807 |
+
"Leuprolide 1 0 1 0 \n",
|
| 808 |
+
"Goserelin 1 0 1 0 \n",
|
| 809 |
+
"Gramicidin D 0 0 0 0 \n",
|
| 810 |
+
"Desmopressin 0 0 1 0 \n",
|
| 811 |
+
"... ... ... ... ... \n",
|
| 812 |
+
"Belumosudil 0 0 1 0 \n",
|
| 813 |
+
"Tebipenem pivoxil 0 0 0 0 \n",
|
| 814 |
+
"Tosufloxacin 0 0 0 0 \n",
|
| 815 |
+
"Linzagolix 0 0 0 0 \n",
|
| 816 |
+
"Methionine C-11 0 0 0 0 \n",
|
| 817 |
+
"\n",
|
| 818 |
+
"name Pyridoxal phosphate Cyanocobalamin ... Naphthoquine \\\n",
|
| 819 |
+
"name ... \n",
|
| 820 |
+
"Bivalirudin 0 0 ... 0 \n",
|
| 821 |
+
"Leuprolide 0 1 ... 0 \n",
|
| 822 |
+
"Goserelin 0 1 ... 0 \n",
|
| 823 |
+
"Gramicidin D 0 0 ... 0 \n",
|
| 824 |
+
"Desmopressin 0 1 ... 0 \n",
|
| 825 |
+
"... ... ... ... ... \n",
|
| 826 |
+
"Belumosudil 0 0 ... 0 \n",
|
| 827 |
+
"Tebipenem pivoxil 0 0 ... 0 \n",
|
| 828 |
+
"Tosufloxacin 0 0 ... 0 \n",
|
| 829 |
+
"Linzagolix 0 0 ... 0 \n",
|
| 830 |
+
"Methionine C-11 0 0 ... 0 \n",
|
| 831 |
+
"\n",
|
| 832 |
+
"name Odevixibat Melphalan flufenamide Deucravacitinib \\\n",
|
| 833 |
+
"name \n",
|
| 834 |
+
"Bivalirudin 0 0 0 \n",
|
| 835 |
+
"Leuprolide 0 0 0 \n",
|
| 836 |
+
"Goserelin 0 0 0 \n",
|
| 837 |
+
"Gramicidin D 0 0 0 \n",
|
| 838 |
+
"Desmopressin 0 0 0 \n",
|
| 839 |
+
"... ... ... ... \n",
|
| 840 |
+
"Belumosudil 0 0 1 \n",
|
| 841 |
+
"Tebipenem pivoxil 0 0 0 \n",
|
| 842 |
+
"Tosufloxacin 0 0 0 \n",
|
| 843 |
+
"Linzagolix 0 0 0 \n",
|
| 844 |
+
"Methionine C-11 0 0 0 \n",
|
| 845 |
+
"\n",
|
| 846 |
+
"name Tegoprazan Belumosudil Tebipenem pivoxil Tosufloxacin \\\n",
|
| 847 |
+
"name \n",
|
| 848 |
+
"Bivalirudin 0 0 0 0 \n",
|
| 849 |
+
"Leuprolide 0 0 0 0 \n",
|
| 850 |
+
"Goserelin 0 0 0 0 \n",
|
| 851 |
+
"Gramicidin D 0 0 0 0 \n",
|
| 852 |
+
"Desmopressin 0 0 0 0 \n",
|
| 853 |
+
"... ... ... ... ... \n",
|
| 854 |
+
"Belumosudil 0 0 0 0 \n",
|
| 855 |
+
"Tebipenem pivoxil 0 0 0 0 \n",
|
| 856 |
+
"Tosufloxacin 0 0 0 0 \n",
|
| 857 |
+
"Linzagolix 0 0 0 0 \n",
|
| 858 |
+
"Methionine C-11 0 0 0 0 \n",
|
| 859 |
+
"\n",
|
| 860 |
+
"name Linzagolix Methionine C-11 \n",
|
| 861 |
+
"name \n",
|
| 862 |
+
"Bivalirudin 0 0 \n",
|
| 863 |
+
"Leuprolide 0 0 \n",
|
| 864 |
+
"Goserelin 0 0 \n",
|
| 865 |
+
"Gramicidin D 0 0 \n",
|
| 866 |
+
"Desmopressin 0 0 \n",
|
| 867 |
+
"... ... ... \n",
|
| 868 |
+
"Belumosudil 0 0 \n",
|
| 869 |
+
"Tebipenem pivoxil 0 0 \n",
|
| 870 |
+
"Tosufloxacin 0 0 \n",
|
| 871 |
+
"Linzagolix 0 0 \n",
|
| 872 |
+
"Methionine C-11 0 0 \n",
|
| 873 |
+
"\n",
|
| 874 |
+
"[2630 rows x 2630 columns]"
|
| 875 |
+
]
|
| 876 |
+
},
|
| 877 |
+
"execution_count": 5,
|
| 878 |
+
"metadata": {},
|
| 879 |
+
"output_type": "execute_result"
|
| 880 |
+
}
|
| 881 |
+
],
|
| 882 |
+
"source": [
|
| 883 |
+
"df_matrix"
|
| 884 |
+
]
|
| 885 |
+
},
|
| 886 |
+
{
|
| 887 |
+
"cell_type": "code",
|
| 888 |
+
"execution_count": 6,
|
| 889 |
+
"metadata": {},
|
| 890 |
+
"outputs": [],
|
| 891 |
+
"source": [
|
| 892 |
+
"import itertools\n",
|
| 893 |
+
"def random_drug_pairs(names, p, random_state=None):\n",
|
| 894 |
+
" \"\"\"\n",
|
| 895 |
+
" Selects p% of all possible pairs of names selected at random from a pandas series of names.\n",
|
| 896 |
+
" \n",
|
| 897 |
+
" Parameters:\n",
|
| 898 |
+
" names (pandas.Series): A pandas series of names.\n",
|
| 899 |
+
" p (float): The percentage of pairs to select (between 0 and 1).\n",
|
| 900 |
+
" random_state (int, optional): Seed for the random number generator.\n",
|
| 901 |
+
" \n",
|
| 902 |
+
" Returns:\n",
|
| 903 |
+
" pandas.Series: A pandas series of selected pairs of names.\n",
|
| 904 |
+
" \"\"\"\n",
|
| 905 |
+
" # Calculate the total number of possible pairs\n",
|
| 906 |
+
" num_pairs = int(len(names) * (len(names) - 1) / 2)\n",
|
| 907 |
+
"\n",
|
| 908 |
+
" # Calculate the number of pairs to select\n",
|
| 909 |
+
" num_selected_pairs = int(p * num_pairs)\n",
|
| 910 |
+
"\n",
|
| 911 |
+
" # Generate all possible pairs of names using itertools\n",
|
| 912 |
+
" all_pairs = list(itertools.combinations(names, 2))\n",
|
| 913 |
+
"\n",
|
| 914 |
+
" # Select a random subset of pairs\n",
|
| 915 |
+
" selected_pairs = pd.Series(all_pairs).sample(n=num_selected_pairs, random_state=random_state)\n",
|
| 916 |
+
"\n",
|
| 917 |
+
" # Return the selected pairs\n",
|
| 918 |
+
" return selected_pairs\n",
|
| 919 |
+
"\n",
|
| 920 |
+
"def exclude_pairs_from_adjacency_matrix(df_matrix, excluded_pairs):\n",
|
| 921 |
+
" col_inds1 = np.array([df_matrix.columns.get_loc(drug1) for drug1, _ in excluded_pairs])\n",
|
| 922 |
+
" col_inds2 = np.array([df_matrix.columns.get_loc(drug2) for _, drug2 in excluded_pairs])\n",
|
| 923 |
+
" values = df_matrix.values[col_inds1, col_inds2]\n",
|
| 924 |
+
" df_matrix.values[col_inds1, col_inds2] = 0\n",
|
| 925 |
+
" df_matrix.values[col_inds2, col_inds1] = 0\n",
|
| 926 |
+
" return values\n",
|
| 927 |
+
"\n",
|
| 928 |
+
"def get_pairs_from_adjacency_matrix(df_matrix, excluded_pairs):\n",
|
| 929 |
+
" col_inds1 = np.array([df_matrix.columns.get_loc(drug1) for drug1, _ in excluded_pairs])\n",
|
| 930 |
+
" col_inds2 = np.array([df_matrix.columns.get_loc(drug2) for _, drug2 in excluded_pairs])\n",
|
| 931 |
+
" values = df_matrix.values[col_inds1, col_inds2]\n",
|
| 932 |
+
" return values"
|
| 933 |
+
]
|
| 934 |
+
},
|
| 935 |
+
{
|
| 936 |
+
"cell_type": "code",
|
| 937 |
+
"execution_count": 7,
|
| 938 |
+
"metadata": {},
|
| 939 |
+
"outputs": [],
|
| 940 |
+
"source": [
|
| 941 |
+
"# train, test split \n",
|
| 942 |
+
"# train set : remove selected pairs from adj matrix\n",
|
| 943 |
+
"# test set: the excluded pairs\n",
|
| 944 |
+
"excluded_pairs = random_drug_pairs(df_drugs['name'], 0.20, 42)\n",
|
| 945 |
+
"excluded_pair_values = exclude_pairs_from_adjacency_matrix(df_matrix, excluded_pairs)"
|
| 946 |
+
]
|
| 947 |
+
},
|
| 948 |
+
{
|
| 949 |
+
"cell_type": "code",
|
| 950 |
+
"execution_count": 8,
|
| 951 |
+
"metadata": {},
|
| 952 |
+
"outputs": [
|
| 953 |
+
{
|
| 954 |
+
"name": "stderr",
|
| 955 |
+
"output_type": "stream",
|
| 956 |
+
"text": [
|
| 957 |
+
"c:\\Users\\Georg\\anaconda3\\lib\\site-packages\\sknetwork\\utils\\check.py:216: Warning: The number of neighbors must be lower than the number of nodes with known labels. Changed accordingly.\n",
|
| 958 |
+
" warnings.warn(Warning(\"The number of neighbors must be lower than the number of nodes with known labels. \"\n"
|
| 959 |
+
]
|
| 960 |
+
}
|
| 961 |
+
],
|
| 962 |
+
"source": [
|
| 963 |
+
"from sknetwork.linkpred import NNLinker\n",
|
| 964 |
+
"from sknetwork.visualization import svg_graph, svg_bigraph\n",
|
| 965 |
+
"linker = NNLinker(n_neighbors=2630, threshold=0)\n",
|
| 966 |
+
"links = linker.fit_predict(df_matrix.to_numpy())"
|
| 967 |
+
]
|
| 968 |
+
},
|
| 969 |
+
{
|
| 970 |
+
"cell_type": "code",
|
| 971 |
+
"execution_count": 9,
|
| 972 |
+
"metadata": {},
|
| 973 |
+
"outputs": [
|
| 974 |
+
{
|
| 975 |
+
"data": {
|
| 976 |
+
"text/html": [
|
| 977 |
+
"<div>\n",
|
| 978 |
+
"<style scoped>\n",
|
| 979 |
+
" .dataframe tbody tr th:only-of-type {\n",
|
| 980 |
+
" vertical-align: middle;\n",
|
| 981 |
+
" }\n",
|
| 982 |
+
"\n",
|
| 983 |
+
" .dataframe tbody tr th {\n",
|
| 984 |
+
" vertical-align: top;\n",
|
| 985 |
+
" }\n",
|
| 986 |
+
"\n",
|
| 987 |
+
" .dataframe thead th {\n",
|
| 988 |
+
" text-align: right;\n",
|
| 989 |
+
" }\n",
|
| 990 |
+
"</style>\n",
|
| 991 |
+
"<table border=\"1\" class=\"dataframe\">\n",
|
| 992 |
+
" <thead>\n",
|
| 993 |
+
" <tr style=\"text-align: right;\">\n",
|
| 994 |
+
" <th>name</th>\n",
|
| 995 |
+
" <th>Bivalirudin</th>\n",
|
| 996 |
+
" <th>Leuprolide</th>\n",
|
| 997 |
+
" <th>Goserelin</th>\n",
|
| 998 |
+
" <th>Gramicidin D</th>\n",
|
| 999 |
+
" <th>Desmopressin</th>\n",
|
| 1000 |
+
" <th>Cetrorelix</th>\n",
|
| 1001 |
+
" <th>Daptomycin</th>\n",
|
| 1002 |
+
" <th>Abarelix</th>\n",
|
| 1003 |
+
" <th>Pyridoxal phosphate</th>\n",
|
| 1004 |
+
" <th>Cyanocobalamin</th>\n",
|
| 1005 |
+
" <th>...</th>\n",
|
| 1006 |
+
" <th>Naphthoquine</th>\n",
|
| 1007 |
+
" <th>Odevixibat</th>\n",
|
| 1008 |
+
" <th>Melphalan flufenamide</th>\n",
|
| 1009 |
+
" <th>Deucravacitinib</th>\n",
|
| 1010 |
+
" <th>Tegoprazan</th>\n",
|
| 1011 |
+
" <th>Belumosudil</th>\n",
|
| 1012 |
+
" <th>Tebipenem pivoxil</th>\n",
|
| 1013 |
+
" <th>Tosufloxacin</th>\n",
|
| 1014 |
+
" <th>Linzagolix</th>\n",
|
| 1015 |
+
" <th>Methionine C-11</th>\n",
|
| 1016 |
+
" </tr>\n",
|
| 1017 |
+
" </thead>\n",
|
| 1018 |
+
" <tbody>\n",
|
| 1019 |
+
" <tr>\n",
|
| 1020 |
+
" <th>0</th>\n",
|
| 1021 |
+
" <td>1.000000</td>\n",
|
| 1022 |
+
" <td>0.306150</td>\n",
|
| 1023 |
+
" <td>0.288741</td>\n",
|
| 1024 |
+
" <td>0.083090</td>\n",
|
| 1025 |
+
" <td>0.310237</td>\n",
|
| 1026 |
+
" <td>0.0</td>\n",
|
| 1027 |
+
" <td>0.338457</td>\n",
|
| 1028 |
+
" <td>0.0</td>\n",
|
| 1029 |
+
" <td>0.0</td>\n",
|
| 1030 |
+
" <td>0.430352</td>\n",
|
| 1031 |
+
" <td>...</td>\n",
|
| 1032 |
+
" <td>0.0</td>\n",
|
| 1033 |
+
" <td>0.0</td>\n",
|
| 1034 |
+
" <td>0.0</td>\n",
|
| 1035 |
+
" <td>0.305532</td>\n",
|
| 1036 |
+
" <td>0.0</td>\n",
|
| 1037 |
+
" <td>0.241866</td>\n",
|
| 1038 |
+
" <td>0.0</td>\n",
|
| 1039 |
+
" <td>0.0</td>\n",
|
| 1040 |
+
" <td>0.105519</td>\n",
|
| 1041 |
+
" <td>0.0</td>\n",
|
| 1042 |
+
" </tr>\n",
|
| 1043 |
+
" <tr>\n",
|
| 1044 |
+
" <th>1</th>\n",
|
| 1045 |
+
" <td>0.306150</td>\n",
|
| 1046 |
+
" <td>1.000000</td>\n",
|
| 1047 |
+
" <td>0.780471</td>\n",
|
| 1048 |
+
" <td>0.087487</td>\n",
|
| 1049 |
+
" <td>0.587482</td>\n",
|
| 1050 |
+
" <td>0.0</td>\n",
|
| 1051 |
+
" <td>0.645092</td>\n",
|
| 1052 |
+
" <td>0.0</td>\n",
|
| 1053 |
+
" <td>0.0</td>\n",
|
| 1054 |
+
" <td>0.601979</td>\n",
|
| 1055 |
+
" <td>...</td>\n",
|
| 1056 |
+
" <td>0.0</td>\n",
|
| 1057 |
+
" <td>0.0</td>\n",
|
| 1058 |
+
" <td>0.0</td>\n",
|
| 1059 |
+
" <td>0.167036</td>\n",
|
| 1060 |
+
" <td>0.0</td>\n",
|
| 1061 |
+
" <td>0.242198</td>\n",
|
| 1062 |
+
" <td>0.0</td>\n",
|
| 1063 |
+
" <td>0.0</td>\n",
|
| 1064 |
+
" <td>0.156851</td>\n",
|
| 1065 |
+
" <td>0.0</td>\n",
|
| 1066 |
+
" </tr>\n",
|
| 1067 |
+
" <tr>\n",
|
| 1068 |
+
" <th>2</th>\n",
|
| 1069 |
+
" <td>0.288741</td>\n",
|
| 1070 |
+
" <td>0.780471</td>\n",
|
| 1071 |
+
" <td>1.000000</td>\n",
|
| 1072 |
+
" <td>0.110648</td>\n",
|
| 1073 |
+
" <td>0.603694</td>\n",
|
| 1074 |
+
" <td>0.0</td>\n",
|
| 1075 |
+
" <td>0.614604</td>\n",
|
| 1076 |
+
" <td>0.0</td>\n",
|
| 1077 |
+
" <td>0.0</td>\n",
|
| 1078 |
+
" <td>0.608478</td>\n",
|
| 1079 |
+
" <td>...</td>\n",
|
| 1080 |
+
" <td>0.0</td>\n",
|
| 1081 |
+
" <td>0.0</td>\n",
|
| 1082 |
+
" <td>0.0</td>\n",
|
| 1083 |
+
" <td>0.123966</td>\n",
|
| 1084 |
+
" <td>0.0</td>\n",
|
| 1085 |
+
" <td>0.217510</td>\n",
|
| 1086 |
+
" <td>0.0</td>\n",
|
| 1087 |
+
" <td>0.0</td>\n",
|
| 1088 |
+
" <td>0.123984</td>\n",
|
| 1089 |
+
" <td>0.0</td>\n",
|
| 1090 |
+
" </tr>\n",
|
| 1091 |
+
" <tr>\n",
|
| 1092 |
+
" <th>3</th>\n",
|
| 1093 |
+
" <td>0.083090</td>\n",
|
| 1094 |
+
" <td>0.087487</td>\n",
|
| 1095 |
+
" <td>0.110648</td>\n",
|
| 1096 |
+
" <td>1.000000</td>\n",
|
| 1097 |
+
" <td>0.028047</td>\n",
|
| 1098 |
+
" <td>0.0</td>\n",
|
| 1099 |
+
" <td>0.190443</td>\n",
|
| 1100 |
+
" <td>0.0</td>\n",
|
| 1101 |
+
" <td>0.0</td>\n",
|
| 1102 |
+
" <td>0.032498</td>\n",
|
| 1103 |
+
" <td>...</td>\n",
|
| 1104 |
+
" <td>0.0</td>\n",
|
| 1105 |
+
" <td>0.0</td>\n",
|
| 1106 |
+
" <td>0.0</td>\n",
|
| 1107 |
+
" <td>0.000000</td>\n",
|
| 1108 |
+
" <td>0.0</td>\n",
|
| 1109 |
+
" <td>0.000000</td>\n",
|
| 1110 |
+
" <td>0.0</td>\n",
|
| 1111 |
+
" <td>0.0</td>\n",
|
| 1112 |
+
" <td>0.039841</td>\n",
|
| 1113 |
+
" <td>0.0</td>\n",
|
| 1114 |
+
" </tr>\n",
|
| 1115 |
+
" <tr>\n",
|
| 1116 |
+
" <th>4</th>\n",
|
| 1117 |
+
" <td>0.310237</td>\n",
|
| 1118 |
+
" <td>0.587482</td>\n",
|
| 1119 |
+
" <td>0.603694</td>\n",
|
| 1120 |
+
" <td>0.028047</td>\n",
|
| 1121 |
+
" <td>1.000000</td>\n",
|
| 1122 |
+
" <td>0.0</td>\n",
|
| 1123 |
+
" <td>0.563808</td>\n",
|
| 1124 |
+
" <td>0.0</td>\n",
|
| 1125 |
+
" <td>0.0</td>\n",
|
| 1126 |
+
" <td>0.630617</td>\n",
|
| 1127 |
+
" <td>...</td>\n",
|
| 1128 |
+
" <td>0.0</td>\n",
|
| 1129 |
+
" <td>0.0</td>\n",
|
| 1130 |
+
" <td>0.0</td>\n",
|
| 1131 |
+
" <td>0.099909</td>\n",
|
| 1132 |
+
" <td>0.0</td>\n",
|
| 1133 |
+
" <td>0.178126</td>\n",
|
| 1134 |
+
" <td>0.0</td>\n",
|
| 1135 |
+
" <td>0.0</td>\n",
|
| 1136 |
+
" <td>0.087996</td>\n",
|
| 1137 |
+
" <td>0.0</td>\n",
|
| 1138 |
+
" </tr>\n",
|
| 1139 |
+
" <tr>\n",
|
| 1140 |
+
" <th>...</th>\n",
|
| 1141 |
+
" <td>...</td>\n",
|
| 1142 |
+
" <td>...</td>\n",
|
| 1143 |
+
" <td>...</td>\n",
|
| 1144 |
+
" <td>...</td>\n",
|
| 1145 |
+
" <td>...</td>\n",
|
| 1146 |
+
" <td>...</td>\n",
|
| 1147 |
+
" <td>...</td>\n",
|
| 1148 |
+
" <td>...</td>\n",
|
| 1149 |
+
" <td>...</td>\n",
|
| 1150 |
+
" <td>...</td>\n",
|
| 1151 |
+
" <td>...</td>\n",
|
| 1152 |
+
" <td>...</td>\n",
|
| 1153 |
+
" <td>...</td>\n",
|
| 1154 |
+
" <td>...</td>\n",
|
| 1155 |
+
" <td>...</td>\n",
|
| 1156 |
+
" <td>...</td>\n",
|
| 1157 |
+
" <td>...</td>\n",
|
| 1158 |
+
" <td>...</td>\n",
|
| 1159 |
+
" <td>...</td>\n",
|
| 1160 |
+
" <td>...</td>\n",
|
| 1161 |
+
" <td>...</td>\n",
|
| 1162 |
+
" </tr>\n",
|
| 1163 |
+
" <tr>\n",
|
| 1164 |
+
" <th>2625</th>\n",
|
| 1165 |
+
" <td>0.241866</td>\n",
|
| 1166 |
+
" <td>0.242198</td>\n",
|
| 1167 |
+
" <td>0.217510</td>\n",
|
| 1168 |
+
" <td>0.000000</td>\n",
|
| 1169 |
+
" <td>0.178126</td>\n",
|
| 1170 |
+
" <td>0.0</td>\n",
|
| 1171 |
+
" <td>0.343441</td>\n",
|
| 1172 |
+
" <td>0.0</td>\n",
|
| 1173 |
+
" <td>0.0</td>\n",
|
| 1174 |
+
" <td>0.270278</td>\n",
|
| 1175 |
+
" <td>...</td>\n",
|
| 1176 |
+
" <td>0.0</td>\n",
|
| 1177 |
+
" <td>0.0</td>\n",
|
| 1178 |
+
" <td>0.0</td>\n",
|
| 1179 |
+
" <td>0.523597</td>\n",
|
| 1180 |
+
" <td>0.0</td>\n",
|
| 1181 |
+
" <td>1.000000</td>\n",
|
| 1182 |
+
" <td>0.0</td>\n",
|
| 1183 |
+
" <td>0.0</td>\n",
|
| 1184 |
+
" <td>0.186761</td>\n",
|
| 1185 |
+
" <td>0.0</td>\n",
|
| 1186 |
+
" </tr>\n",
|
| 1187 |
+
" <tr>\n",
|
| 1188 |
+
" <th>2626</th>\n",
|
| 1189 |
+
" <td>0.000000</td>\n",
|
| 1190 |
+
" <td>0.000000</td>\n",
|
| 1191 |
+
" <td>0.000000</td>\n",
|
| 1192 |
+
" <td>0.000000</td>\n",
|
| 1193 |
+
" <td>0.000000</td>\n",
|
| 1194 |
+
" <td>0.0</td>\n",
|
| 1195 |
+
" <td>0.000000</td>\n",
|
| 1196 |
+
" <td>0.0</td>\n",
|
| 1197 |
+
" <td>0.0</td>\n",
|
| 1198 |
+
" <td>0.000000</td>\n",
|
| 1199 |
+
" <td>...</td>\n",
|
| 1200 |
+
" <td>0.0</td>\n",
|
| 1201 |
+
" <td>0.0</td>\n",
|
| 1202 |
+
" <td>0.0</td>\n",
|
| 1203 |
+
" <td>0.000000</td>\n",
|
| 1204 |
+
" <td>0.0</td>\n",
|
| 1205 |
+
" <td>0.000000</td>\n",
|
| 1206 |
+
" <td>0.0</td>\n",
|
| 1207 |
+
" <td>0.0</td>\n",
|
| 1208 |
+
" <td>0.000000</td>\n",
|
| 1209 |
+
" <td>0.0</td>\n",
|
| 1210 |
+
" </tr>\n",
|
| 1211 |
+
" <tr>\n",
|
| 1212 |
+
" <th>2627</th>\n",
|
| 1213 |
+
" <td>0.000000</td>\n",
|
| 1214 |
+
" <td>0.000000</td>\n",
|
| 1215 |
+
" <td>0.000000</td>\n",
|
| 1216 |
+
" <td>0.000000</td>\n",
|
| 1217 |
+
" <td>0.000000</td>\n",
|
| 1218 |
+
" <td>0.0</td>\n",
|
| 1219 |
+
" <td>0.000000</td>\n",
|
| 1220 |
+
" <td>0.0</td>\n",
|
| 1221 |
+
" <td>0.0</td>\n",
|
| 1222 |
+
" <td>0.000000</td>\n",
|
| 1223 |
+
" <td>...</td>\n",
|
| 1224 |
+
" <td>0.0</td>\n",
|
| 1225 |
+
" <td>0.0</td>\n",
|
| 1226 |
+
" <td>0.0</td>\n",
|
| 1227 |
+
" <td>0.000000</td>\n",
|
| 1228 |
+
" <td>0.0</td>\n",
|
| 1229 |
+
" <td>0.000000</td>\n",
|
| 1230 |
+
" <td>0.0</td>\n",
|
| 1231 |
+
" <td>0.0</td>\n",
|
| 1232 |
+
" <td>0.000000</td>\n",
|
| 1233 |
+
" <td>0.0</td>\n",
|
| 1234 |
+
" </tr>\n",
|
| 1235 |
+
" <tr>\n",
|
| 1236 |
+
" <th>2628</th>\n",
|
| 1237 |
+
" <td>0.105519</td>\n",
|
| 1238 |
+
" <td>0.156851</td>\n",
|
| 1239 |
+
" <td>0.123984</td>\n",
|
| 1240 |
+
" <td>0.039841</td>\n",
|
| 1241 |
+
" <td>0.087996</td>\n",
|
| 1242 |
+
" <td>0.0</td>\n",
|
| 1243 |
+
" <td>0.164395</td>\n",
|
| 1244 |
+
" <td>0.0</td>\n",
|
| 1245 |
+
" <td>0.0</td>\n",
|
| 1246 |
+
" <td>0.097106</td>\n",
|
| 1247 |
+
" <td>...</td>\n",
|
| 1248 |
+
" <td>0.0</td>\n",
|
| 1249 |
+
" <td>0.0</td>\n",
|
| 1250 |
+
" <td>0.0</td>\n",
|
| 1251 |
+
" <td>0.082406</td>\n",
|
| 1252 |
+
" <td>0.0</td>\n",
|
| 1253 |
+
" <td>0.186761</td>\n",
|
| 1254 |
+
" <td>0.0</td>\n",
|
| 1255 |
+
" <td>0.0</td>\n",
|
| 1256 |
+
" <td>1.000000</td>\n",
|
| 1257 |
+
" <td>0.0</td>\n",
|
| 1258 |
+
" </tr>\n",
|
| 1259 |
+
" <tr>\n",
|
| 1260 |
+
" <th>2629</th>\n",
|
| 1261 |
+
" <td>0.000000</td>\n",
|
| 1262 |
+
" <td>0.000000</td>\n",
|
| 1263 |
+
" <td>0.000000</td>\n",
|
| 1264 |
+
" <td>0.000000</td>\n",
|
| 1265 |
+
" <td>0.000000</td>\n",
|
| 1266 |
+
" <td>0.0</td>\n",
|
| 1267 |
+
" <td>0.000000</td>\n",
|
| 1268 |
+
" <td>0.0</td>\n",
|
| 1269 |
+
" <td>0.0</td>\n",
|
| 1270 |
+
" <td>0.000000</td>\n",
|
| 1271 |
+
" <td>...</td>\n",
|
| 1272 |
+
" <td>0.0</td>\n",
|
| 1273 |
+
" <td>0.0</td>\n",
|
| 1274 |
+
" <td>0.0</td>\n",
|
| 1275 |
+
" <td>0.000000</td>\n",
|
| 1276 |
+
" <td>0.0</td>\n",
|
| 1277 |
+
" <td>0.000000</td>\n",
|
| 1278 |
+
" <td>0.0</td>\n",
|
| 1279 |
+
" <td>0.0</td>\n",
|
| 1280 |
+
" <td>0.000000</td>\n",
|
| 1281 |
+
" <td>0.0</td>\n",
|
| 1282 |
+
" </tr>\n",
|
| 1283 |
+
" </tbody>\n",
|
| 1284 |
+
"</table>\n",
|
| 1285 |
+
"<p>2630 rows × 2630 columns</p>\n",
|
| 1286 |
+
"</div>"
|
| 1287 |
+
],
|
| 1288 |
+
"text/plain": [
|
| 1289 |
+
"name Bivalirudin Leuprolide Goserelin Gramicidin D Desmopressin \\\n",
|
| 1290 |
+
"0 1.000000 0.306150 0.288741 0.083090 0.310237 \n",
|
| 1291 |
+
"1 0.306150 1.000000 0.780471 0.087487 0.587482 \n",
|
| 1292 |
+
"2 0.288741 0.780471 1.000000 0.110648 0.603694 \n",
|
| 1293 |
+
"3 0.083090 0.087487 0.110648 1.000000 0.028047 \n",
|
| 1294 |
+
"4 0.310237 0.587482 0.603694 0.028047 1.000000 \n",
|
| 1295 |
+
"... ... ... ... ... ... \n",
|
| 1296 |
+
"2625 0.241866 0.242198 0.217510 0.000000 0.178126 \n",
|
| 1297 |
+
"2626 0.000000 0.000000 0.000000 0.000000 0.000000 \n",
|
| 1298 |
+
"2627 0.000000 0.000000 0.000000 0.000000 0.000000 \n",
|
| 1299 |
+
"2628 0.105519 0.156851 0.123984 0.039841 0.087996 \n",
|
| 1300 |
+
"2629 0.000000 0.000000 0.000000 0.000000 0.000000 \n",
|
| 1301 |
+
"\n",
|
| 1302 |
+
"name Cetrorelix Daptomycin Abarelix Pyridoxal phosphate Cyanocobalamin \\\n",
|
| 1303 |
+
"0 0.0 0.338457 0.0 0.0 0.430352 \n",
|
| 1304 |
+
"1 0.0 0.645092 0.0 0.0 0.601979 \n",
|
| 1305 |
+
"2 0.0 0.614604 0.0 0.0 0.608478 \n",
|
| 1306 |
+
"3 0.0 0.190443 0.0 0.0 0.032498 \n",
|
| 1307 |
+
"4 0.0 0.563808 0.0 0.0 0.630617 \n",
|
| 1308 |
+
"... ... ... ... ... ... \n",
|
| 1309 |
+
"2625 0.0 0.343441 0.0 0.0 0.270278 \n",
|
| 1310 |
+
"2626 0.0 0.000000 0.0 0.0 0.000000 \n",
|
| 1311 |
+
"2627 0.0 0.000000 0.0 0.0 0.000000 \n",
|
| 1312 |
+
"2628 0.0 0.164395 0.0 0.0 0.097106 \n",
|
| 1313 |
+
"2629 0.0 0.000000 0.0 0.0 0.000000 \n",
|
| 1314 |
+
"\n",
|
| 1315 |
+
"name ... Naphthoquine Odevixibat Melphalan flufenamide Deucravacitinib \\\n",
|
| 1316 |
+
"0 ... 0.0 0.0 0.0 0.305532 \n",
|
| 1317 |
+
"1 ... 0.0 0.0 0.0 0.167036 \n",
|
| 1318 |
+
"2 ... 0.0 0.0 0.0 0.123966 \n",
|
| 1319 |
+
"3 ... 0.0 0.0 0.0 0.000000 \n",
|
| 1320 |
+
"4 ... 0.0 0.0 0.0 0.099909 \n",
|
| 1321 |
+
"... ... ... ... ... ... \n",
|
| 1322 |
+
"2625 ... 0.0 0.0 0.0 0.523597 \n",
|
| 1323 |
+
"2626 ... 0.0 0.0 0.0 0.000000 \n",
|
| 1324 |
+
"2627 ... 0.0 0.0 0.0 0.000000 \n",
|
| 1325 |
+
"2628 ... 0.0 0.0 0.0 0.082406 \n",
|
| 1326 |
+
"2629 ... 0.0 0.0 0.0 0.000000 \n",
|
| 1327 |
+
"\n",
|
| 1328 |
+
"name Tegoprazan Belumosudil Tebipenem pivoxil Tosufloxacin Linzagolix \\\n",
|
| 1329 |
+
"0 0.0 0.241866 0.0 0.0 0.105519 \n",
|
| 1330 |
+
"1 0.0 0.242198 0.0 0.0 0.156851 \n",
|
| 1331 |
+
"2 0.0 0.217510 0.0 0.0 0.123984 \n",
|
| 1332 |
+
"3 0.0 0.000000 0.0 0.0 0.039841 \n",
|
| 1333 |
+
"4 0.0 0.178126 0.0 0.0 0.087996 \n",
|
| 1334 |
+
"... ... ... ... ... ... \n",
|
| 1335 |
+
"2625 0.0 1.000000 0.0 0.0 0.186761 \n",
|
| 1336 |
+
"2626 0.0 0.000000 0.0 0.0 0.000000 \n",
|
| 1337 |
+
"2627 0.0 0.000000 0.0 0.0 0.000000 \n",
|
| 1338 |
+
"2628 0.0 0.186761 0.0 0.0 1.000000 \n",
|
| 1339 |
+
"2629 0.0 0.000000 0.0 0.0 0.000000 \n",
|
| 1340 |
+
"\n",
|
| 1341 |
+
"name Methionine C-11 \n",
|
| 1342 |
+
"0 0.0 \n",
|
| 1343 |
+
"1 0.0 \n",
|
| 1344 |
+
"2 0.0 \n",
|
| 1345 |
+
"3 0.0 \n",
|
| 1346 |
+
"4 0.0 \n",
|
| 1347 |
+
"... ... \n",
|
| 1348 |
+
"2625 0.0 \n",
|
| 1349 |
+
"2626 0.0 \n",
|
| 1350 |
+
"2627 0.0 \n",
|
| 1351 |
+
"2628 0.0 \n",
|
| 1352 |
+
"2629 0.0 \n",
|
| 1353 |
+
"\n",
|
| 1354 |
+
"[2630 rows x 2630 columns]"
|
| 1355 |
+
]
|
| 1356 |
+
},
|
| 1357 |
+
"execution_count": 9,
|
| 1358 |
+
"metadata": {},
|
| 1359 |
+
"output_type": "execute_result"
|
| 1360 |
+
}
|
| 1361 |
+
],
|
| 1362 |
+
"source": [
|
| 1363 |
+
"df_predicted = pd.DataFrame.sparse.from_spmatrix(links)\n",
|
| 1364 |
+
"df_predicted.columns = df_matrix.columns\n",
|
| 1365 |
+
"df_predicted"
|
| 1366 |
+
]
|
| 1367 |
+
},
|
| 1368 |
+
{
|
| 1369 |
+
"cell_type": "code",
|
| 1370 |
+
"execution_count": 10,
|
| 1371 |
+
"metadata": {},
|
| 1372 |
+
"outputs": [],
|
| 1373 |
+
"source": [
|
| 1374 |
+
"predictions = get_pairs_from_adjacency_matrix(df_predicted, excluded_pairs)"
|
| 1375 |
+
]
|
| 1376 |
+
},
|
| 1377 |
+
{
|
| 1378 |
+
"cell_type": "code",
|
| 1379 |
+
"execution_count": 11,
|
| 1380 |
+
"metadata": {},
|
| 1381 |
+
"outputs": [
|
| 1382 |
+
{
|
| 1383 |
+
"data": {
|
| 1384 |
+
"text/plain": [
|
| 1385 |
+
"130972.25693560754"
|
| 1386 |
+
]
|
| 1387 |
+
},
|
| 1388 |
+
"execution_count": 11,
|
| 1389 |
+
"metadata": {},
|
| 1390 |
+
"output_type": "execute_result"
|
| 1391 |
+
}
|
| 1392 |
+
],
|
| 1393 |
+
"source": [
|
| 1394 |
+
"predictions.sum()"
|
| 1395 |
+
]
|
| 1396 |
+
},
|
| 1397 |
+
{
|
| 1398 |
+
"cell_type": "code",
|
| 1399 |
+
"execution_count": 12,
|
| 1400 |
+
"metadata": {},
|
| 1401 |
+
"outputs": [
|
| 1402 |
+
{
|
| 1403 |
+
"data": {
|
| 1404 |
+
"text/plain": [
|
| 1405 |
+
"0.9689347313012144"
|
| 1406 |
+
]
|
| 1407 |
+
},
|
| 1408 |
+
"execution_count": 12,
|
| 1409 |
+
"metadata": {},
|
| 1410 |
+
"output_type": "execute_result"
|
| 1411 |
+
}
|
| 1412 |
+
],
|
| 1413 |
+
"source": [
|
| 1414 |
+
"import numpy as np\n",
|
| 1415 |
+
"from sklearn import metrics\n",
|
| 1416 |
+
"metrics.roc_auc_score(excluded_pair_values, predictions)"
|
| 1417 |
+
]
|
| 1418 |
+
},
|
| 1419 |
+
{
|
| 1420 |
+
"cell_type": "code",
|
| 1421 |
+
"execution_count": 13,
|
| 1422 |
+
"metadata": {},
|
| 1423 |
+
"outputs": [
|
| 1424 |
+
{
|
| 1425 |
+
"data": {
|
| 1426 |
+
"text/plain": [
|
| 1427 |
+
"<sklearn.metrics._plot.roc_curve.RocCurveDisplay at 0x1f500c9a550>"
|
| 1428 |
+
]
|
| 1429 |
+
},
|
| 1430 |
+
"execution_count": 13,
|
| 1431 |
+
"metadata": {},
|
| 1432 |
+
"output_type": "execute_result"
|
| 1433 |
+
},
|
| 1434 |
+
{
|
| 1435 |
+
"data": {
|
| 1436 |
+
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAYIAAAEGCAYAAABo25JHAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjMuNCwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8QVMy6AAAACXBIWXMAAAsTAAALEwEAmpwYAAAw1UlEQVR4nO3dd5hV5bn38e89jaF3EClSBA1gRETsBo1dbEePwUQ8dn0tpLzxhPeYGKMmmpgTE2OLGo4dYomKJaKeiFiiAjrgAApIkSq9M3Xf7x9rzbin7jXDrBlm9u9zXXPNXm2ve+2Bde+nrOcxd0dERNJXRlMHICIiTUuJQEQkzSkRiIikOSUCEZE0p0QgIpLmspo6gLrq1q2b9+/fv6nDEBFpVmbPnr3B3btXt63ZJYL+/fsza9aspg5DRKRZMbPlNW1T1ZCISJpTIhARSXNKBCIiaU6JQEQkzSkRiIikudgSgZlNMrN1ZpZfw3Yzs3vMbLGZzTWzkXHFIiIiNYuzRPAocGot208DBoc/VwEPxBiLiIjUILbnCNx9hpn1r2WXs4HHPRgH+0Mz62Rmvdx9TVwxiewpd6eoNEFRSYLCsp/iUopLneLSBEWlCYpLEpQkgv1KSp3SRILiUqckESy7gxP8TiS9dnccwvVe/pvkdUmvG+Na635MHfev8xnqfo7gPHU7qH7nqOsBdT/JqP5dOG5Itc+E7ZGmfKCsN7AiaXlluK5KIjCzqwhKDfTr169RgpOWoagkwbaCYrbtLmbr7mK27C5me0EJOwpK2FFYzI7CUnYWlrCrqIQdhaXsKChmZ1GwbndxKQVFpRSUBDf+opLgRi/SUMzqtv813xnU4hJBdR9BtSnS3R8CHgIYNWqUZtJJI+7O7uJStuwqZvOuIrYXlLCzsIRtBcVs3lnMll1FbN5VXH6z31FYwtbdxWzeFdz4i0pS37jb5GTStlUWbXMyaZ+bTZucTHp2yKVNTiatszNplZ1BTmYmOVkZwU+mkZudSatwuVVWsC07M4OsTCMnM4OsDCM7K4PsjGBddqaRlZFBZoZhBmZGhoFRtvzN6wwzjOA34baydWWvoe43EQjOUaf963WOOu5fj5PUI6w6X0t94mqumjIRrAT6Ji33AVY3USzSiIpKEqzespt12wvZsKOQddsKWL+jkA3bi8KbeBGbdxWxZVfwDb62m7kZdGydTYfcbDq0zqJDbjYDurVlZJscOrbOpn1uFu1zs4N9WmfRsXUOHVtn0bZVFu1aZdE2J4uMjPT5Dy9SnaZMBFOB681sCnA4sFXtAy1DacJZvWU3SzfsZMXmXazYtJvlG3eyastuVm/ZzcadRVWqRzMzjC5tc+jcJptOrXMY2K0dndoEN/DObYObeuc22bTPzaZtqyw65GbRKbzZZ+pGLrJHYksEZjYZGAN0M7OVwC+BbAB3fxB4DTgdWAzsAi6NKxaJR0FxKfPXbGPR19tZsn4nX67fybKNO/lq064K3+KzM42+XdrQu1NrhvbqQM8OufTt0oYe7VvRPfzp0iZH38xFmkicvYYuTLHdgeviOr80nETCWbVlN4vX7eDztdv5Yu02FqzZzuL1OyhNBF/tczIz2K9rGwZ2a8sJB/ZgYLe2DOjWlr5d2tCzQ66+tYvsxZrdMNQSr627ilmwdhsLv97OF2u3s2DNNj5fu51dRaXl+/TqmMuB+7TnpKE9Gd67I0N7daB359a62Ys0U0oEaWzdtgLyV28lf9U2FqzZxpwVW1i9taB8e4fcLA7s1YELRvVlSM/2DO7ZjiE929OxdXYTRi0iDU2JIE1s3lnEZ6u2MnflFvJWBL/XbS8s396/axsO7d+Fi/ftwAH7tOfAfdqzT4fctOpCJ5KulAhaoNKE8+X6HeSt2MLsZZv5eNkmlm7YWb59YPe2HL1/Nw7q3TGo2tm3A+1a6Z+CSLrS//4WYMOOQmYu3cSnK7Ywd+UW8ldtY0dhCRD0sR+1X2fGHd
aXYft25KA+HVW1IyIVKBE0Q2u27ubDJRuZtWwz7y7awFebdgGQk5XBt3p14NxDejOibycO7tuRgd3aqVumiNQqUiIwswzgYGBfYDcwz92/jjMw+cbmnUV88OVG3v9yA+8uWs+KTbsBaNcqiyMGduGiI/px6H6dOah3J3KyNMWEiNRNrYnAzAYBPwNOBBYB64FcYIiZ7QL+Ajzm7hqJqwEVlpQye9lm3l28gXe+WM/na7eR8LIbf1cuOWoAhw/owrd6dVCXTRHZY6lKBLcTzBNwtVcak9bMegDfB8YDj8UTXvpYt72AafO+Zvrn6/jgy43sLi4lM8M4rH9nbjhhMMcN6c63+3QkO1Pf+EWkYdWaCGp7Otjd1wF/bOiA0smGHYW8Me9rps5ZxUdLN+EO+3Vtw/mH9uE7Q7pz+MAutM9Vw66IxKvejcVmdpK7v9mQwaSD7QXFvDp3Da/MXcMHX24g4UF3zgknDOa0g/bhwH06NHWIIpJm9qTX0F8BzRITgbvz4ZJNPPXRct5a8DUFxQn269qGa8fsz+kH9eJbvdrrwS0RaTKpGoun1rQJ6Nrw4bQsm3YW8fRHy3l29kqWb9xFpzbZnH9oH/5tZB8O6dtJN38R2SukKhEcC1wE7Ki03oDRsUTUAixet4O/vreEFz9dze7iUo4Y2IUJJwzm9IN60Tons6nDExGpIFUi+BDY5e7vVN5gZl/EE1LztWbrbn73+he8mLeK7MwMzh3RmyuOHcDgnu2bOjQRkRql6jV0Wi3bjmv4cJqn4tIEk95byr3/XExhaYKrjxvEFccOoFu7Vk0dmohIShpiYg8tXredH/0tj/xV2zj+gO786qzh9OvapqnDEhGJTImgntydx/+1nF+/toB2rbJ48KKRnDq8V1OHJSJSZ0oE9VBQXMrE5+fyYt5qTjiwB3eedxA92uc2dVgiIvWiRFBH2wqKufKxWXy0dBM/OWkI1x+/v0b3FJFmLfLANWZ2S23L6WDt1gIuePBfzF6+mT+NG8GE7w5WEhCRZq8uJYLZKZZbtFVbdnPhQx+ycUch/3PpYRw7uHtThyQi0iAiJwJ3f7m25ZZs3bYCLnrkIzbvKuLJKw7nkH6dmzokEZEGk2qIiT8DXtN2d5/Q4BHtZbbuLuaS/5nJ19sKePyy0UoCItLipCoRzGqUKPZSJaUJrnp8FovWbeeR/ziMUf27NHVIIiINLtWTxRUmnDGztu6+M96Q9h6/f2MhHy3dxH//+8F8Z4jaBESkZYrUa8jMjjSz+cCCcPlgM7s/1sia2IyF6/nLjC8Zd1hfzju0T1OHIyISm6jdR/8InAJsBHD3OUCLHWtoZ2EJP3t+LoO6t+OXZw5r6nBERGIV+TkCd19RaVVpA8ey17jjHwtYu62A3553kIaNFpEWL2r30RVmdhTgZpYDTCCsJmpp8ldt5ckPv+Kyowdw6H5qHBaRli9qieAa4DqgN7AKGBEutzi/ff1zurTN4YcnDm7qUEREGkWkRODuG9z9B+7e0927u/tF7r4x1XFmdqqZfWFmi81sYjXbO5rZy2Y2x8zmmdml9bmIhvLuovW8u2gD144ZRMfW2U0ZiohIo4naa2hgeMNeb2brzOwlMxuY4phM4D7gNGAocKGZDa2023XAfHc/GBgD/HdY9dTo3J0/vrWIfTvmMv7I/ZoiBBGRJhG1auhp4BmgF7Av8CwwOcUxo4HF7r7E3YuAKcDZlfZxoL0Fs7i3AzYBJRFjalAzl21m9vLN/MdR/WmVpQZiEUkfUROBufsT7l4S/jxJLUNPhHoDyT2NVobrkt0LfAtYDXwG/NDdE1VObnaVmc0ys1nr16+PGHLdPP6vZXTIzVJpQETSTq2JwMy6mFkX4G0zm2hm/c1sPzP7T+DVFO9d3fjMlZPHKUAeQSljBHCvmXWocpD7Q+4+yt1Hde/e8E/4rt1awOv5azn/0L60ydEUDSKSXlLd9WYT3LzLbupXJ21z4LZajl0J9E1a7kPwzT
/ZpcCd7u7AYjNbChwIfJwirgb17KwVlCSc/zhKpQERST+pxhoasAfvPRMYbGYDCLqcjgO+X2mfr4DvAu+aWU/gAGDJHpyzztydF/NWcfiALuzXtW1jnlpEZK8QuR7EzIYT9P4pn5zX3R+vaX93LzGz64FpQCYwyd3nmdk14fYHCUoUj5rZZwSljp+5+4Z6XUk9zV+zjS/X7+SyY/Yk54mINF+REoGZ/ZKge+dQ4DWCLqHvATUmAgB3fy3cP3ndg0mvVwMn1yniBvbPBesAOHnoPk0ZhohIk4naa+h8giqcte5+KXAw0Cq2qBrRjEXrOah3R7q3bxGXIyJSZ1ETwe6wW2dJ2KtnHVDrA2XNwbaCYj75agvHDu7W1KGIiDSZqG0Es8ysE/AwQU+iHTRyz544zF62mdKEc8z+SgQikr4iJQJ3vzZ8+aCZvQ50cPe58YXVOD5auomsDNM8xCKS1lJNXj+ytm3u/knDh9R4Zi3bxPDeHTXngIiktVQlgv+uZZsDJzRgLI2qoLiUuSu3cunR/Zs6FBGRJpXqgbLjGyuQxrZgzTaKShMc0q9TU4ciItKkIk9V2dIsWLMdgGH7dmziSEREmlbaJoKFX2+nTU4mvTu1bupQRESaVNomgi/Wbmdwz/ZkZFQ3SKqISPqIOkOZmdlFZnZzuNzPzEbHG1q8Fq3bwZAe7Zo6DBGRJhe1RHA/cCRwYbi8nWAaymZpW0ExG3YUMkiJQEQk8pPFh7v7SDP7FMDdNzfV3MINYen6nQAM6KZhp0VEopYIisPJ6B3AzLoDVaaUbC6Wb9oFQH/NPyAiEjkR3AO8APQws18TDEH9m9iiitmqzbsB6N1ZPYZERKKONfSUmc0mGIragHPcfUGskcVo9ZbddGydTbtWmp9YRCTqxDR/Av7m7s22gTjZ6i276dUxN/WOIiJpIGrV0CfAz81ssZndZWaj4gwqbqu3FrCvHiQTEQEiJgJ3f8zdTwdGAwuB35rZolgji9GarbvZt5NKBCIiUPcni/cHDgT6A583eDSNoLCklC27iunZXolARASiP1lcVgK4FZgHHOruZ8YaWUw27ywGoEu7ZvsYhIhIg4rabWYpcKS7b4gzmMawaWcRAJ3bKBGIiEDqGcoOdPfPCeYn7mdm/ZK3N8cZyrbsUiIQEUmWqkTwE+Aqqp+prFnOULZld1A11LltdhNHIiKyd0g1Q9lV4cvT3L0geZuZNcvW1i27gkTQsbUSgYgIRO819EHEdXu9rWGJoFNrVQ2JiEDqNoJ9gN5AazM7hGB4CYAOQJuYY4vFtoJisjON3Oy0nZNHRKSCVG0EpwCXAH2APySt3w78V0wxxWp7QTHtc7Mx08xkIiKQuo3gMeAxMzvP3Z9vpJhitb2gRIPNiYgkSVU1dJG7Pwn0N7OfVN7u7n+o5rC92g4lAhGRClJVlJfN3NIOaF/NT63M7FQz+yIcrG5iDfuMMbM8M5tnZu/UIfZ62VmkRCAikixV1dBfwt+/qusbhzOa3QecBKwEZprZVHefn7RPJ4L5kE9196/MrEddz1NXu4sT6joqIpIk6lhDvzOzDmaWbWb/a2YbzOyiFIeNBha7+xJ3LwKmAGdX2uf7wN/d/SsAd19X1wuoq8LiUnKz1GNIRKRM1Dviye6+DRhL8O1+CHBjimN6AyuSlleG65INATqb2XQzm21mF1f3RmZ2lZnNMrNZ69evjxhy9XYXl5KbnblH7yEi0pJETQRldSmnA5PdfVOEY6rrn+mVlrOAQ4EzCLqq/sLMhlQ5yP0hdx/l7qO6d+8eMeTqFRSX0iZHiUBEpEzUVtOXzexzYDdwrZl1BwpSHLMS6Ju03AdYXc0+G9x9J7DTzGYABxNMfhOLXUUqEYiIJIs6Q9lE4EhglLsXAzupWt9f2UxgsJkNMLMcYBwwtdI+LwHHmlmWmbUBDgcW1OUC6qqguJTWKhGIiJSLOnl9NjAeOC58Ivcd4MHajnH3EjO7HpgGZAKT3H2emV
0Tbn/Q3ReY2evAXCABPOLu+fW+mhQSCae41GmlxmIRkXJRq4YeIGgnuD9cHh+uu6K2g9z9NeC1SuserLR8F3BXxDj2SFFpAoBWWSoRiIiUiZoIDnP3g5OW/2lmc+IIKE6FxWWJQCUCEZEyUe+IpWY2qGzBzAYCpfGEFJ/CkiDkHCUCEZFyUUsENwJvm9kSgm6h+wGXxhZVTApLVCIQEaksZSIIu4puJXhSuAdBIvjc3Qtjjq3BlSUClQhERL5R6x3RzK4A5gF/BvKA/u4+pzkmAfimakiNxSIi30hVIvgRMMzd14ftAk9R9VmAZqOovESgSWlERMqkqiMpcvf1AO6+BGgVf0jxKS4NRrjIyVSJQESkTKoSQR8zu6emZXefEE9Y8SgJnyPIzlSJQESkTKpEUHmE0dlxBdIYyh4oy8pUY7GISJkocxa3GCXlVUNKBCIiZVL1GnrIzIbXsK2tmV1mZj+IJ7SGV1xeIlDVkIhImVRVQ/cDN5vZQUA+sB7IBQYDHYBJBD2JmoXiRFAiUBuBiMg3UlUN5QEXmFk7YBTQi2BOggXu/kX84TWs0kRYIshQ1ZCISJlIQ0y4+w5geryhxK+sjSAzQyUCEZEyafXVuDSsGlIbgYjIN9IqEZS1EahEICLyjTolAjNrG1cgjaG07IEytRGIiJSLdEc0s6PMbD7hfMJmdrCZ3Z/isL1OSVgiyFCJQESkXNSvxncDpwAbAdx9DnBcXEHFJeGqGhIRqSxyHYm7r6i0qtnNUBYWCFAeEBH5RtQZylaY2VGAm1kOMIGwmqg5Kes1lGHKBCIiZaKWCK4BrgN6AyuBEcC1McUUm4R6DYmIVBG1RHCAu1cYU8jMjgbeb/iQ4lNa1kagEoGISLmoJYI/R1y3VytvI1CJQESkXK0lAjM7EjgK6G5mP0na1AFodtN8JRKuhmIRkUpSVQ3lAO3C/donrd8GnB9XUHEpdVf7gIhIJalGH30HeMfMHnX35Y0UU2yCEoESgYhIsqiNxbvM7C5gGMF8BAC4+wmxRBWT0oRKBCIilUVtLH4K+BwYAPwKWAbMjCmm2JS6q8eQiEglURNBV3f/K1Ds7u+4+2XAETHGFQt39RgSEaksatVQcfh7jZmdAawG+sQTUnxK1WtIRKSKqCWC282sI/B/gZ8CjwA/SnWQmZ1qZl+Y2WIzm1jLfoeZWamZxdoTKeFqLBYRqSzqVJWvhC+3AsdD+ZPFNTKzTOA+4CSCYSlmmtlUd59fzX6/BabVLfS6S6hqSESkilpLBGaWaWYXmtlPzWx4uG6smX0A3JvivUcDi919ibsXAVOAs6vZ7wbgeWBd3cOvGz1QJiJSVaoSwV+BvsDHwD1mthw4Epjo7i+mOLY3kDx09Urg8OQdzKw3cC5wAnBYTW9kZlcBVwH069cvxWlrllCvIRGRKlIlglHAt909YWa5wAZgf3dfG+G9q7vjeqXlPwI/c/dSq+UG7e4PAQ8BjBo1qvJ7RFbqTm3nERFJR6kSQZG7JwDcvcDMFkZMAhCUAPomLfch6G2UbBQwJbw5dwNON7OSCKWNenHXENQiIpWlSgQHmtnc8LUBg8JlA9zdv13LsTOBwWY2AFgFjAO+n7yDuw8oe21mjwKvxJUEQN1HRUSqkyoRfKu+b+zuJWZ2PUFvoExgkrvPM7Nrwu0P1ve960vdR0VEqko16NweDTTn7q8Br1VaV20CcPdL9uRcUSQ0+qiISBWRJ69vCRIJzVcsIlJZeiUCd5QHREQqipwIzKy1mR0QZzBxS7hKBCIilUVKBGZ2JpAHvB4ujzCzqTHGFYuEOxlpVQYSEUkt6m3xFoIhI7YAuHse0D+OgOKkJ4tFRKqKmghK3H1rrJE0goSjJ4tFRCqJOh9Bvpl9H8g0s8HABOCD+MKKh7seKBMRqSxqieAGgvmKC4GnCYaj/lFMMcVGD5SJiFQVtURwgLvfBNwUZzBx03MEIiJVRS0R/MHMPjez28xsWKwRxUjPEYiIVB
UpEbj78cAYYD3wkJl9ZmY/jzOwOKhqSESkqsi96t19rbvfA1xD8EzBzXEFFZdgqsqmjkJEZO8S9YGyb5nZLWaWTzBF5QcE8ws0K+6OVTtfjohI+oraWPw/wGTgZHevPLlMs6HJ60VEqoqUCNz9iLgDaQxBiUBERJLVmgjM7Bl3v8DMPqPifMNRZijb6zjogTIRkUpSlQh+GP4eG3cgjSGhyetFRKqotbHY3deEL6919+XJP8C18YfXsNxVIhARqSxqZ8qTqll3WkMG0hgSDqiVQESkglRtBP+H4Jv/QDObm7SpPfB+nIHFQYPOiYhUlaqN4GngH8AdwMSk9dvdfVNsUcXENUOZiEgVqRKBu/syM7uu8gYz69LckoHGGhIRqSpKiWAsMJug92XybdSBgTHFFYug+6gygYhIsloTgbuPDX8PaJxw4pVwV1uxiEglUccaOtrM2oavLzKzP5hZv3hDi4HaCEREqojaffQBYJeZHQz8J7AceCK2qGKS0BATIiJV1GXyegfOBv7k7n8i6ELarCT0QJmISBVRRx/dbmb/DxgPHGtmmUB2fGHFw9HENCIilUUtEXyPYOL6y9x9LdAbuCu2qGKSSKDGYhGRSqJOVbkWeAroaGZjgQJ3fzzWyGKiEoGISEVRew1dAHwM/DtwAfCRmZ0f4bhTzewLM1tsZhOr2f4DM5sb/nwQNkbHRo3FIiJVRW0juAk4zN3XAZhZd+At4LmaDgjbEe4jGLBuJTDTzKa6+/yk3ZYC33H3zWZ2GvAQcHjdLyMaDTEhIlJV1DaCjLIkENoY4djRwGJ3X+LuRcAUgl5H5dz9A3ffHC5+SMzzIGuICRGRqqKWCF43s2kE8xZD0Hj8WopjegMrkpZXUvu3/csJBrirwsyuAq4C6Nev/s+xefBe9T5eRKQlijpn8Y1m9m/AMQT9bh5y9xdSHFbdHderWYeZHU+QCI6p4fwPEVQbMWrUqGrfIwpXiUBEpIpU8xEMBn4PDAI+A37q7qsivvdKoG/Sch9gdTXn+DbwCHCau2+M+N71ogfKRESqSlXPPwl4BTiPYATSP9fhvWcCg81sgJnlAOOAqck7hOMV/R0Y7+4L6/De9RJMTKNMICKSLFXVUHt3fzh8/YWZfRL1jd29xMyuB6YBmcAkd59nZteE2x8Ebga6AveHdfcl7j6qrhcRVUKDj4qIVJEqEeSa2SF8c/9snbzs7rUmBnd/jUqNymECKHt9BXBFXYOur6CNQKlARCRZqkSwBvhD0vLapGUHTogjqLi4o8ZiEZFKUk1Mc3xjBdIYNEOZiEhVUR8oaxES7uo1JCJSSdolArURiIhUlFaJQG0EIiJVRR191MK5im8Ol/uZ2eh4Q2t4wdz1ygQiIsmilgjuB44ELgyXtxOMLNqsBDOUNXUUIiJ7l6iDzh3u7iPN7FOAcNjonBjjikVCVUMiIlVELREUh/MLOJTPR5CILaqYaIgJEZGqoiaCe4AXgB5m9mvgPeA3sUUVA3cPSwRKBCIiyaIOQ/2Umc0GvkswvMQ57r4g1sgamIeDV2cqEYiIVBApEYSjhO4CXk5e5+5fxRVYQ0uEmUCNxSIiFUVtLH6VcIIvIBcYAHwBDIsprgaXCEsEGcoEIiIVRK0aOih52cxGAlfHElFMykoEqhkSEamoXk8Wh8NPH9bAscSqrI1AvYZERCqK2kbwk6TFDGAksD6WiGKiNgIRkepFbSNon/S6hKDN4PmGDyc+3yQCZQIRkWQpE0H4IFk7d7+xEeKJTVljsZ4jEBGpqNY2AjPLcvdSgqqgZs1VNSQiUq1UJYKPCZJAnplNBZ4FdpZtdPe/xxhbgypNqGpIRKQ6UdsIugAbCeYoLnuewIFmkwjKnyNQHhARqSBVIugR9hjK55sEUMZjiyoG5VVDygSylyguLmblypUUFBQ0dSjSguTm5tKnTx+ys7MjH5MqEWQC7aDa2VyaVSIoSTSrcCUNrFy5kvbt29O/f391YpAG4e5s3L
iRlStXMmDAgMjHpUoEa9z91j0Lbe9Qlga2F5Q0aRwiZQoKCpQEpEGZGV27dmX9+ro95pXqyeIW8y80EZYIurZtdvPpSAumJCANrT7/plIlgu/WL5S9T1mvoaxM/ccTEUlWayJw902NFUjcStR9VKSKtWvXMm7cOAYNGsTQoUM5/fTTWbhwIcuWLWP48OENdp6bb76Zt956C4B3332XYcOGMWLECFatWsX555+/R+/t7pxwwgls27atfN0LL7yAmfH555+Xr5s+fTpjx46tcOwll1zCc889BwSN9xMnTmTw4MEMHz6c0aNH849//GOPYgO444472H///TnggAOYNm1atfvMmTOHI488koMOOogzzzyz/FqeeuopRowYUf6TkZFBXl4eACeeeCKbN2/e4/ignoPONUdlQ0xkqteQCBDcQM8991zGjBnDl19+yfz58/nNb37D119/3eDnuvXWWznxxBOB4Ob205/+lLy8PHr37l1+I46itLS0yrrXXnuNgw8+mA4dOpSvmzx5MscccwxTpkyJ/N6/+MUvWLNmDfn5+eTn5/Pyyy+zffv2yMdXZ/78+UyZMoV58+bx+uuvc+2111Z7DVdccQV33nknn332Geeeey533XUXAD/4wQ/Iy8sjLy+PJ554gv79+zNixAgAxo8fz/33379H8ZWJ+hxBs1deNaREIHuhX708j/mrt6XesQ6G7tuBX55Z85Qhb7/9NtnZ2VxzzTXl68puMsuWLStft2zZMsaPH8/OncGzpPfeey9HHXUUa9as4Xvf+x7btm2jpKSEBx54gKOOOorLL7+cWbNmYWZcdtll/PjHP+aSSy5h7NixbNmyhWeeeYZp06bx1ltv8etf/5qxY8eSn59PaWkpEydOZPr06RQWFnLddddx9dVXM336dH71q1/Rq1cv8vLymD9/foXreOqpp7jqqqvKl3fs2MH777/P22+/zVlnncUtt9yS8rPatWsXDz/8MEuXLqVVq1YA9OzZkwsuuCDlsbV56aWXGDduHK1atWLAgAHsv//+fPzxxxx55JEV9vviiy847rjjADjppJM45ZRTuO222yrsM3nyZC688MLy5bPOOotjjz2Wm266aY9ihDRMBKoaEgnk5+dz6KGHptyvR48evPnmm+Tm5rJo0SIuvPBCZs2axdNPP80pp5zCTTfdRGlpKbt27SIvL49Vq1aRn58PwJYtWyq81xVXXMF7773H2LFjOf/88ysknL/+9a907NiRmTNnUlhYyNFHH83JJ58MwMcff0x+fn61XSLff/99/vKXv5Qvv/jii5x66qkMGTKELl268MknnzByZO2j5CxevJh+/fpVKFXU5Mc//jFvv/12lfXjxo1j4sSJFdatWrWKI444ony5T58+rFq1qsqxw4cPZ+rUqZx99tk8++yzrFixoso+f/vb33jppZfKlzt37kxhYSEbN26ka9euKeOuTdokAlUNyd6stm/uTa24uJjrr7+evLw8MjMzWbhwIQCHHXYYl112GcXFxZxzzjmMGDGCgQMHsmTJEm644QbOOOOM8ht5FG+88QZz584tryraunUrixYtIicnh9GjR9fYL37Tpk20b//NAMmTJ0/mRz/6ERDcnCdPnszIkSNr7E1T1142d999d+R9yx5kTXW+SZMmMWHCBG699VbOOusscnIq9m786KOPaNOmTZV2mx49erB69eq9OxGY2anAnwgeTHvE3e+stN3C7acTzIl8STjpTYMrLlUiEEk2bNiwSPXzd999Nz179mTOnDkkEglyc3MBOO6445gxYwavvvoq48eP58Ybb+Tiiy9mzpw5TJs2jfvuu49nnnmGSZMmRYrH3fnzn//MKaecUmH99OnTadu2bY3HZWVlkUgkyMjIYOPGjfzzn/8kPz8fM6O0tBQz43e/+x1du3at0ri6adMmunXrxv77789XX33F9u3bKySV6tSlRNCnT58K3+5XrlzJvvvuW+XYAw88kDfeeAOAhQsX8u
qrr1bYPmXKlArVQmUKCgpo3bp1rfFGEVtjcTh89X3AacBQ4EIzG1ppt9OAweHPVcADccWzuyhooGnbKm0KQSK1OuGEEygsLOThhx8uXzdz5kzeeeedCvtt3bqVXr16kZGRwRNPPFHe2Ll8+XJ69OjBlVdeyeWXX84nn3zChg0bSCQSnHfeedx222188kn073WnnHIKDzzwAMXFxUBwQyxrl6jNAQccwJIlSwB47rnnuPjii1m+fDnLli1jxYoVDBgwgPfee4/BgwezevVqFixYUB7/nDlzGDFiBG3atOHyyy9nwoQJFBUVAbBmzRqefPLJKue7++67yxtwk38qJwEI6vGnTJlCYWEhS5cuZdGiRYwePbrKfuvWrQMgkUhw++23V2i3SSQSPPvss4wbN67CMe7O2rVr6d+/f8rPKJU4ew2NBha7+xJ3LwKmAGdX2uds4HEPfAh0MrNecQSzozB4orhNTmYcby/S7JgZL7zwAm+++SaDBg1i2LBh3HLLLVW+sV577bU89thjHHHEESxcuLD82/n06dMZMWIEhxxyCM8//zw//OEPWbVqFWPGjGHEiBFccskl3HHHHZHjueKKKxg6dCgjR45k+PDhXH311ZSUpB4J4IwzzmD69OlAUC107rnnVth+3nnn8fTTT9OqVSuefPJJLr30UkaMGMH555/PI488QseOHQG4/fbb6d69O0OHDmX48OGcc845dO/ePXL81Rk2bBgXXHABQ4cO5dRTT+W+++4jMzOz/HpnzZpVHveQIUM48MAD2Xfffbn00kvL32PGjBn06dOHgQMHVnjv2bNnc8QRR5CV1QBfbt09lh/gfILqoLLl8cC9lfZ5BTgmafl/gVHVvNdVwCxgVr9+/bw+Zi3b6Nc8McvXbNldr+NFGtr8+fObOoQWYfXq1X7iiSc2dRiNbsKECf7WW29Vu626f1vALK/hfh1nPUmUgeoiDWbn7g8BDwGMGjWqXqPHHbpfFw7dr0t9DhWRvVivXr248sor2bZtW6RePy3F8OHD+e53G2bwhzgTwUqgb9JyH2B1PfYREanVnvb3b46uvPLKBnuvONsIZgKDzWyAmeUA44CplfaZClxsgSOAre6+JsaYRPYqXk33QpE9UZ9/U7GVCNy9xMyuB6YRdB+d5O7zzOyacPuDwGsEXUcXE3QfvbSm9xNpaXJzc8sfBtIopNIQPJyPoKyLb1TW3L6RjBo1ysta2kWaM81QJnGoaYYyM5vt7qOqO0ad6kWaSHZ2dp1mkRKJS9qMPioiItVTIhARSXNKBCIiaa7ZNRab2XpgeT0P7wZsaMBwmgNdc3rQNaeHPbnm/dy92jEzml0i2BNmNqumVvOWStecHnTN6SGua1bVkIhImlMiEBFJc+mWCB5q6gCagK45Peia00Ms15xWbQQiIlJVupUIRESkEiUCEZE01yITgZmdamZfmNliM6sykWg47PU94fa5ZjayKeJsSBGu+Qfhtc41sw/M7OCmiLMhpbrmpP0OM7NSMzu/MeOLQ5RrNrMxZpZnZvPM7J3q9mlOIvzb7mhmL5vZnPCam/UoxmY2yczWmVl+Ddsb/v5V09RlzfWHYMjrL4GBQA4wBxhaaZ/TgX8QzJB2BPBRU8fdCNd8FNA5fH1aOlxz0n7/JBjy/PymjrsR/s6dgPlAv3C5R1PH3QjX/F/Ab8PX3YFNQE5Tx74H13wcMBLIr2F7g9+/WmKJYDSw2N2XuHsRMAU4u9I+ZwOPe+BDoJOZ9WrsQBtQymt29w/cfXO4+CHBbHDNWZS/M8ANwPPAusYMLiZRrvn7wN/d/SsAd2/u1x3lmh1ob8GkDu0IEkHqWe/3Uu4+g+AaatLg96+WmAh6AyuSlleG6+q6T3NS1+u5nOAbRXOW8prNrDdwLvBgI8YVpyh/5yFAZzObbmazzeziRosuHlGu+V7gWwTT3H4G/NDdE40TXpNo8PtXS5yPoLqpnir3kY2yT3MS+XrM7HiCRHBMrBHFL8
o1/xH4mbuXtpAZwKJccxZwKPBdoDXwLzP70N0Xxh1cTKJc8ylAHnACMAh408zedfdtMcfWVBr8/tUSE8FKoG/Sch+Cbwp13ac5iXQ9ZvZt4BHgNHff2EixxSXKNY8CpoRJoBtwupmVuPuLjRJhw4v6b3uDu+8EdprZDOBgoLkmgijXfClwpwcV6IvNbClwIPBx44TY6Br8/tUSq4ZmAoPNbICZ5QDjgKmV9pkKXBy2vh8BbHX3NY0daANKec1m1g/4OzC+GX87TJbymt19gLv3d/f+wHPAtc04CUC0f9svAceaWZaZtQEOBxY0cpwNKco1f0VQAsLMegIHAEsaNcrG1eD3rxZXInD3EjO7HphG0ONgkrvPM7Nrwu0PEvQgOR1YDOwi+EbRbEW85puBrsD94TfkEm/GIzdGvOYWJco1u/sCM3sdmAskgEfcvdpuiM1BxL/zbcCjZvYZQbXJz9y92Q5PbWaTgTFANzNbCfwSyIb47l8aYkJEJM21xKohERGpAyUCEZE0p0QgIpLmlAhERNKcEoGISJpTIkgD4cibeUk//WvZd0cDnO9RM1sanusTMzuyHu/xiJkNDV//V6VtH+xpjOH7lH0u+eHolZ1S7D/CzE6vx3l6mdkr4esxZrbVzD41swVm9st6vN9ZZaNwmtk5ZZ9TuHyrmZ1Y1/es5hyPWorRWsNhLCJ3QQ6v/ZUI+1U7+qaZ/d7MToh6PolOiSA97Hb3EUk/yxrhnDe6+whgIvCXuh7s7le4+/xw8b8qbTtqz8MDvvlchhMM8nVdiv1HEPTfrqufAA8nLb/r7ocQPPl8kZkdWpc3c/ep7n5nuHgOMDRp283u/lY9YtybPAqcWs36PxP8e5IGpkSQhsysnZn9b/ht/TMzqzJqZ/gtdkbSN+Zjw/Unm9m/wmOfNbN2KU43A9g/PPYn4Xvlm9mPwnVtzexVC8aSzzez74Xrp5vZKDO7E2gdxvFUuG1H+Ptvyd/Qw2+x55lZppndZWYzLRiv/eoIH8u/CAfuMrPRFszZ8Gn4+4DwqdZbge+FsXwvjH1SeJ5Pq/scQ+cBr1deGQ4DMRsYFJY2PgzjfcHMOoexTDCz+eH6KeG6S8zsXjM7CjgLuCuMaVDZN3kzO83Mnkn6bMaY2cvh6zr9Dc3s5vAa883sIbMKAzddFH5G+WY2Otw/6udSrZpG33T35UBXM9unLu8nETTWGNv6abofoJRgUK484AWCJ8o7hNu6ETyhWPZw4Y7w9/8FbgpfZwLtw31nAG3D9T8Dbq7mfI8Sjv0P/DvwEcFAaJ8BbQmGCp4HHEJwk3w46diO4e/pwKjkmJL2KYvxXOCx8HUOwYiMrYGrgJ+H61sBs4AB1cS5I+n6ngVODZc7AFnh6xOB58PXlwD3Jh3/G+Ci8HUngvF82lY6xwBgdtLyGOCV8HVXYBkwjOBJ4O+E628F/hi+Xg20KjtH5TiSP+vk5fBv/FXS3+oB4KJ6/g27JK1/Ajgz6W/0cPj6OMLx82v6XCpd+yiCp55r+jfbn2rG4ycoWZ3X1P+nWtpPixtiQqq124NqGgDMLBv4jZkdRzAMQW+gJ7A26ZiZwKRw3xfdPc/MvkNQDfF++KUwh+CbdHXuMrOfA+sJRjv9LvCCB9+CMbO/A8cSfFP+vZn9luAm8W4drusfwD1m1oqgKmGGu+82s5OBbyfVcXcEBgNLKx3f2szyCG46s4E3k/Z/zMwGE4zqmF3D+U8GzjKzn4bLuUA/Ko7t0yv8DJIda2afEnz2dxIMItbJ3ctmE3uMIDFBkCCeMrMXgRdriKMKD4ZmeB0408yeA84A/hOoy9+wzPFm9p9AG6ALQRJ/Odw2OTzfDDPrYEE7S02fS3J8s4Arol5PknXAvvU4TmqhRJCefkAwk9Oh7l5sZssI/rOWC/9jH0dwA3nCzO4CNgNvuvuFEc5xo7s/V7ZgNTRguvvCsI78dOAOM3vD3W+NchHuXmBm0wmGIf4e4U2JYLyZG9x9Wo
q32O3uI8ysI/AKQRvBPQRj17zt7uda0LA+vYbjjeDb6Re1nYNKny1BG8HY8jcJzl+TMwi+bZ8F/MLMhtWyb2V/I7imTcBMd98eVutE/RtiZrnA/QSlsxVmdgsVr6fyGDVODZ+LBQPC7alcgs9UGpDaCNJTR2BdmASOB/arvIOZ7Rfu8zDwV4Kp8z4Ejjazsjr/NmY2JOI5ZwDnhMe0JajWedfM9gV2ufuTwO/D81RWHJZMqjOFYNCtYwkGJiP8/X/KjjGzIeE5q+XuW4EJwE/DYzoCq8LNlyTtup2giqzMNOCGsjpzMzukmrdfSFDiqFF4/s0WtsMA44F3zCwD6OvubxN8m+9EUK2WrHJMyaYTfJ5XEiQFqPvfsOymvyFsS6jck6isTecYglEwtxLtc6mvIUCzHURvb6VEkJ6eAkaZ2SyC0sHn1ewzBsgLqzDOA/7k7usJboyTzWwuwU3lwCgndPdPCOqdPyZoM3jE3T8FDgI+DqtobgJur+bwh4C5FjYWV/IGwTfmtzyYyhCCORfmA59Y0AXxL6Qo/YaxzCEY5vh3BKWT9wnaD8q8DQwtaywmKDlkh7Hlh8uV33cn8GXZjbcW/0FQnTaXoHfSreG5n7RgVM1PgbvdfUul46YAN4aNsoMqnbuUoKRzWvibuv4Nw/M9TNC+8yJBlWGyzRZ0532QoAoQInwuFnQEeKS6c1ow+ua/gAPMbKWZXR6uzyboeDCrpnilfjT6qEjMzOxcgmq4nzd1LM1Z+DmOdPdfNHUsLY3aCERi5u4vmFnXpo6jBcgC/rupg2iJVCIQEUlzaiMQEUlzSgQiImlOiUBEJM0pEYiIpDklAhGRNPf/AaHCUMohgRR1AAAAAElFTkSuQmCC",
|
| 1437 |
+
"text/plain": [
|
| 1438 |
+
"<Figure size 432x288 with 1 Axes>"
|
| 1439 |
+
]
|
| 1440 |
+
},
|
| 1441 |
+
"metadata": {
|
| 1442 |
+
"needs_background": "light"
|
| 1443 |
+
},
|
| 1444 |
+
"output_type": "display_data"
|
| 1445 |
+
}
|
| 1446 |
+
],
|
| 1447 |
+
"source": [
|
| 1448 |
+
"from sklearn.metrics import RocCurveDisplay\n",
|
| 1449 |
+
"RocCurveDisplay.from_predictions(excluded_pair_values, predictions)"
|
| 1450 |
+
]
|
| 1451 |
+
},
|
| 1452 |
+
{
|
| 1453 |
+
"cell_type": "code",
|
| 1454 |
+
"execution_count": null,
|
| 1455 |
+
"metadata": {},
|
| 1456 |
+
"outputs": [],
|
| 1457 |
+
"source": []
|
| 1458 |
+
}
|
| 1459 |
+
],
|
| 1460 |
+
"metadata": {
|
| 1461 |
+
"kernelspec": {
|
| 1462 |
+
"display_name": "base",
|
| 1463 |
+
"language": "python",
|
| 1464 |
+
"name": "python3"
|
| 1465 |
+
},
|
| 1466 |
+
"language_info": {
|
| 1467 |
+
"codemirror_mode": {
|
| 1468 |
+
"name": "ipython",
|
| 1469 |
+
"version": 3
|
| 1470 |
+
},
|
| 1471 |
+
"file_extension": ".py",
|
| 1472 |
+
"mimetype": "text/x-python",
|
| 1473 |
+
"name": "python",
|
| 1474 |
+
"nbconvert_exporter": "python",
|
| 1475 |
+
"pygments_lexer": "ipython3",
|
| 1476 |
+
"version": "3.8.8"
|
| 1477 |
+
},
|
| 1478 |
+
"orig_nbformat": 4
|
| 1479 |
+
},
|
| 1480 |
+
"nbformat": 4,
|
| 1481 |
+
"nbformat_minor": 2
|
| 1482 |
+
}
|
parse.ipynb
ADDED
|
@@ -0,0 +1,698 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"cells": [
|
| 3 |
+
{
|
| 4 |
+
"attachments": {},
|
| 5 |
+
"cell_type": "markdown",
|
| 6 |
+
"metadata": {},
|
| 7 |
+
"source": [
|
| 8 |
+
"# Convert the DrugBank XML databse to JSON and extract features"
|
| 9 |
+
]
|
| 10 |
+
},
|
| 11 |
+
{
|
| 12 |
+
"cell_type": "markdown",
|
| 13 |
+
"metadata": {},
|
| 14 |
+
"source": [
|
| 15 |
+
"Run using Python 3 to avoid a non-ascii character error when writing to file with the csv module."
|
| 16 |
+
]
|
| 17 |
+
},
|
| 18 |
+
{
|
| 19 |
+
"cell_type": "code",
|
| 20 |
+
"execution_count": 120,
|
| 21 |
+
"metadata": {
|
| 22 |
+
"collapsed": true
|
| 23 |
+
},
|
| 24 |
+
"outputs": [],
|
| 25 |
+
"source": [
|
| 26 |
+
"import os\n",
|
| 27 |
+
"import csv\n",
|
| 28 |
+
"import gzip\n",
|
| 29 |
+
"import collections\n",
|
| 30 |
+
"import re\n",
|
| 31 |
+
"import io\n",
|
| 32 |
+
"import json\n",
|
| 33 |
+
"import xml.etree.ElementTree as ET\n",
|
| 34 |
+
"import requests\n",
|
| 35 |
+
"import pandas\n",
|
| 36 |
+
"import xmltodict\n",
|
| 37 |
+
"import json"
|
| 38 |
+
]
|
| 39 |
+
},
|
| 40 |
+
{
|
| 41 |
+
"cell_type": "code",
|
| 42 |
+
"execution_count": 3,
|
| 43 |
+
"metadata": {},
|
| 44 |
+
"outputs": [],
|
| 45 |
+
"source": [
|
| 46 |
+
"xml_path = \"data/full_database.xml\"\n",
|
| 47 |
+
"json_path = \"data/full_database.json\""
|
| 48 |
+
]
|
| 49 |
+
},
|
| 50 |
+
{
|
| 51 |
+
"cell_type": "code",
|
| 52 |
+
"execution_count": 2,
|
| 53 |
+
"metadata": {},
|
| 54 |
+
"outputs": [],
|
| 55 |
+
"source": [
|
| 56 |
+
"# Read the XML file\n",
|
| 57 |
+
"\n",
|
| 58 |
+
"with open('data/full_database.xml', encoding=\"UTF8\") as f:\n",
|
| 59 |
+
" db = xmltodict.parse(f.read())\n",
|
| 60 |
+
"\n",
|
| 61 |
+
"json_obj = json.dumps(db, indent=4)\n",
|
| 62 |
+
"\n",
|
| 63 |
+
"# output as json\n",
|
| 64 |
+
"with open(\"data/full_database.json\", \"w\") as outfile:\n",
|
| 65 |
+
" outfile.write(json_obj)"
|
| 66 |
+
]
|
| 67 |
+
},
|
| 68 |
+
{
|
| 69 |
+
"cell_type": "code",
|
| 70 |
+
"execution_count": 173,
|
| 71 |
+
"metadata": {},
|
| 72 |
+
"outputs": [],
|
| 73 |
+
"source": [
|
| 74 |
+
"desired_props_exp = set([\"Water Solubility\",\n",
|
| 75 |
+
" \"Melting Point\",\n",
|
| 76 |
+
" \"Boiling Point\",\n",
|
| 77 |
+
" \"logP\",\n",
|
| 78 |
+
" \"logS\",\n",
|
| 79 |
+
" \"Hydrophobicity\",\n",
|
| 80 |
+
" \"Isoelectric Point\",\n",
|
| 81 |
+
" \"caco2 Permeability\",\n",
|
| 82 |
+
" \"pKa\",\n",
|
| 83 |
+
" \"Molecular Weight\",\n",
|
| 84 |
+
" \"Radioactivity\"])\n",
|
| 85 |
+
"\n",
|
| 86 |
+
"desired_props_calc = set([\"logP\",\n",
|
| 87 |
+
" \"logS\",\n",
|
| 88 |
+
" \"Water Solubility\",\n",
|
| 89 |
+
" \"Molecular Weight\",\n",
|
| 90 |
+
" \"Monoisotopic Weight\",\n",
|
| 91 |
+
" \"Polar Surface Area (PSA)\",\n",
|
| 92 |
+
" \"Refractivity\",\n",
|
| 93 |
+
" \"Polarizability\",\n",
|
| 94 |
+
" \"Rotatable Bond Count\",\n",
|
| 95 |
+
" \"H Bond Acceptor Count\",\n",
|
| 96 |
+
" \"H Bond Donor Count\",\n",
|
| 97 |
+
" \"pKa (strongest acidic)\",\n",
|
| 98 |
+
" \"pKa (strongest basic)\",\n",
|
| 99 |
+
" \"Physiological Charge\",\n",
|
| 100 |
+
" \"Number of Rings\",\n",
|
| 101 |
+
" \"Bioavailability\",\n",
|
| 102 |
+
" \"Rule of Five\",\n",
|
| 103 |
+
" \"Ghose Filter\",\n",
|
| 104 |
+
" \"MDDR-Like Rule\",\n",
|
| 105 |
+
" \"Veber's Rule\"])\n",
|
| 106 |
+
"\n",
|
| 107 |
+
"def getProperties(desired_props, props, row):\n",
|
| 108 |
+
" for prop in desired_props:\n",
|
| 109 |
+
" if prop not in row:\n",
|
| 110 |
+
" row[prop] = None\n",
|
| 111 |
+
"\n",
|
| 112 |
+
" try:\n",
|
| 113 |
+
" for prop in props:\n",
|
| 114 |
+
" if(prop['kind'] in desired_props):\n",
|
| 115 |
+
" match = re.search(r\"[-+]?[0-9]*\\.?[0-9]+([eE][-+]?[0-9]+)?\", prop['value'])\n",
|
| 116 |
+
" row[prop['kind']] = float(match.group(0))\n",
|
| 117 |
+
" except:\n",
|
| 118 |
+
" pass"
|
| 119 |
+
]
|
| 120 |
+
},
|
| 121 |
+
{
|
| 122 |
+
"cell_type": "code",
|
| 123 |
+
"execution_count": 12,
|
| 124 |
+
"metadata": {},
|
| 125 |
+
"outputs": [],
|
| 126 |
+
"source": [
|
| 127 |
+
"with open(json_path) as f:\n",
|
| 128 |
+
" data = json.load(f)"
|
| 129 |
+
]
|
| 130 |
+
},
|
| 131 |
+
{
|
| 132 |
+
"cell_type": "code",
|
| 133 |
+
"execution_count": 174,
|
| 134 |
+
"metadata": {},
|
| 135 |
+
"outputs": [],
|
| 136 |
+
"source": [
|
| 137 |
+
"rows = []\n",
|
| 138 |
+
"for i in range(15235):\n",
|
| 139 |
+
" row = {}\n",
|
| 140 |
+
" drug = data['drugbank']['drug'][i]\n",
|
| 141 |
+
" row['name'] = drug['name']\n",
|
| 142 |
+
" row['state'] = drug.get('state', None)\n",
|
| 143 |
+
" atc_code = None\n",
|
| 144 |
+
" try:\n",
|
| 145 |
+
" atc_code = drug.get('atc-codes', dict()).get('atc-code', None)\n",
|
| 146 |
+
" atc_code = atc_code[0]\n",
|
| 147 |
+
" except:\n",
|
| 148 |
+
" pass\n",
|
| 149 |
+
"\n",
|
| 150 |
+
" row['level4'] = None\n",
|
| 151 |
+
" row['level3'] = None\n",
|
| 152 |
+
" row['level2'] = None\n",
|
| 153 |
+
" row['level1'] = None\n",
|
| 154 |
+
" try:\n",
|
| 155 |
+
" row['level4'] = atc_code['level'][0]['@code']\n",
|
| 156 |
+
" row['level3'] = atc_code['level'][1]['@code']\n",
|
| 157 |
+
" row['level2'] = atc_code['level'][2]['@code']\n",
|
| 158 |
+
" row['level1'] = atc_code['level'][3]['@code']\n",
|
| 159 |
+
" except:\n",
|
| 160 |
+
" pass\n",
|
| 161 |
+
"\n",
|
| 162 |
+
" \n",
|
| 163 |
+
" try:\n",
|
| 164 |
+
" exp_props = drug['experimental-properties']['property']\n",
|
| 165 |
+
" except:\n",
|
| 166 |
+
" exp_props = None\n",
|
| 167 |
+
" getProperties(desired_props_exp, exp_props, row)\n",
|
| 168 |
+
"\n",
|
| 169 |
+
" try:\n",
|
| 170 |
+
" calc_props = drug['calculated-properties']['property']\n",
|
| 171 |
+
" except:\n",
|
| 172 |
+
" calc_props = None\n",
|
| 173 |
+
" getProperties(desired_props_calc, calc_props, row)\n",
|
| 174 |
+
"\n",
|
| 175 |
+
" rows.append(row)"
|
| 176 |
+
]
|
| 177 |
+
},
|
| 178 |
+
{
|
| 179 |
+
"cell_type": "code",
|
| 180 |
+
"execution_count": 175,
|
| 181 |
+
"metadata": {},
|
| 182 |
+
"outputs": [
|
| 183 |
+
{
|
| 184 |
+
"data": {
|
| 185 |
+
"text/html": [
|
| 186 |
+
"<div>\n",
|
| 187 |
+
"<style scoped>\n",
|
| 188 |
+
" .dataframe tbody tr th:only-of-type {\n",
|
| 189 |
+
" vertical-align: middle;\n",
|
| 190 |
+
" }\n",
|
| 191 |
+
"\n",
|
| 192 |
+
" .dataframe tbody tr th {\n",
|
| 193 |
+
" vertical-align: top;\n",
|
| 194 |
+
" }\n",
|
| 195 |
+
"\n",
|
| 196 |
+
" .dataframe thead th {\n",
|
| 197 |
+
" text-align: right;\n",
|
| 198 |
+
" }\n",
|
| 199 |
+
"</style>\n",
|
| 200 |
+
"<table border=\"1\" class=\"dataframe\">\n",
|
| 201 |
+
" <thead>\n",
|
| 202 |
+
" <tr style=\"text-align: right;\">\n",
|
| 203 |
+
" <th></th>\n",
|
| 204 |
+
" <th>name</th>\n",
|
| 205 |
+
" <th>state</th>\n",
|
| 206 |
+
" <th>level4</th>\n",
|
| 207 |
+
" <th>level3</th>\n",
|
| 208 |
+
" <th>level2</th>\n",
|
| 209 |
+
" <th>level1</th>\n",
|
| 210 |
+
" <th>Hydrophobicity</th>\n",
|
| 211 |
+
" <th>Boiling Point</th>\n",
|
| 212 |
+
" <th>Molecular Weight</th>\n",
|
| 213 |
+
" <th>Isoelectric Point</th>\n",
|
| 214 |
+
" <th>...</th>\n",
|
| 215 |
+
" <th>Polar Surface Area (PSA)</th>\n",
|
| 216 |
+
" <th>Veber's Rule</th>\n",
|
| 217 |
+
" <th>pKa (strongest basic)</th>\n",
|
| 218 |
+
" <th>Ghose Filter</th>\n",
|
| 219 |
+
" <th>Monoisotopic Weight</th>\n",
|
| 220 |
+
" <th>MDDR-Like Rule</th>\n",
|
| 221 |
+
" <th>Polarizability</th>\n",
|
| 222 |
+
" <th>H Bond Acceptor Count</th>\n",
|
| 223 |
+
" <th>Physiological Charge</th>\n",
|
| 224 |
+
" <th>Rule of Five</th>\n",
|
| 225 |
+
" </tr>\n",
|
| 226 |
+
" </thead>\n",
|
| 227 |
+
" <tbody>\n",
|
| 228 |
+
" <tr>\n",
|
| 229 |
+
" <th>0</th>\n",
|
| 230 |
+
" <td>Lepirudin</td>\n",
|
| 231 |
+
" <td>solid</td>\n",
|
| 232 |
+
" <td>B01AE</td>\n",
|
| 233 |
+
" <td>B01A</td>\n",
|
| 234 |
+
" <td>B01</td>\n",
|
| 235 |
+
" <td>B</td>\n",
|
| 236 |
+
" <td>NaN</td>\n",
|
| 237 |
+
" <td>NaN</td>\n",
|
| 238 |
+
" <td>NaN</td>\n",
|
| 239 |
+
" <td>NaN</td>\n",
|
| 240 |
+
" <td>...</td>\n",
|
| 241 |
+
" <td>NaN</td>\n",
|
| 242 |
+
" <td>None</td>\n",
|
| 243 |
+
" <td>NaN</td>\n",
|
| 244 |
+
" <td>NaN</td>\n",
|
| 245 |
+
" <td>NaN</td>\n",
|
| 246 |
+
" <td>NaN</td>\n",
|
| 247 |
+
" <td>NaN</td>\n",
|
| 248 |
+
" <td>NaN</td>\n",
|
| 249 |
+
" <td>NaN</td>\n",
|
| 250 |
+
" <td>NaN</td>\n",
|
| 251 |
+
" </tr>\n",
|
| 252 |
+
" <tr>\n",
|
| 253 |
+
" <th>1</th>\n",
|
| 254 |
+
" <td>Cetuximab</td>\n",
|
| 255 |
+
" <td>liquid</td>\n",
|
| 256 |
+
" <td>L01FE</td>\n",
|
| 257 |
+
" <td>L01F</td>\n",
|
| 258 |
+
" <td>L01</td>\n",
|
| 259 |
+
" <td>L</td>\n",
|
| 260 |
+
" <td>-0.413</td>\n",
|
| 261 |
+
" <td>NaN</td>\n",
|
| 262 |
+
" <td>145781.6000</td>\n",
|
| 263 |
+
" <td>8.48</td>\n",
|
| 264 |
+
" <td>...</td>\n",
|
| 265 |
+
" <td>NaN</td>\n",
|
| 266 |
+
" <td>None</td>\n",
|
| 267 |
+
" <td>NaN</td>\n",
|
| 268 |
+
" <td>NaN</td>\n",
|
| 269 |
+
" <td>NaN</td>\n",
|
| 270 |
+
" <td>NaN</td>\n",
|
| 271 |
+
" <td>NaN</td>\n",
|
| 272 |
+
" <td>NaN</td>\n",
|
| 273 |
+
" <td>NaN</td>\n",
|
| 274 |
+
" <td>NaN</td>\n",
|
| 275 |
+
" </tr>\n",
|
| 276 |
+
" <tr>\n",
|
| 277 |
+
" <th>2</th>\n",
|
| 278 |
+
" <td>Dornase alfa</td>\n",
|
| 279 |
+
" <td>liquid</td>\n",
|
| 280 |
+
" <td>R05CB</td>\n",
|
| 281 |
+
" <td>R05C</td>\n",
|
| 282 |
+
" <td>R05</td>\n",
|
| 283 |
+
" <td>R</td>\n",
|
| 284 |
+
" <td>-0.083</td>\n",
|
| 285 |
+
" <td>NaN</td>\n",
|
| 286 |
+
" <td>29253.9000</td>\n",
|
| 287 |
+
" <td>4.58</td>\n",
|
| 288 |
+
" <td>...</td>\n",
|
| 289 |
+
" <td>NaN</td>\n",
|
| 290 |
+
" <td>None</td>\n",
|
| 291 |
+
" <td>NaN</td>\n",
|
| 292 |
+
" <td>NaN</td>\n",
|
| 293 |
+
" <td>NaN</td>\n",
|
| 294 |
+
" <td>NaN</td>\n",
|
| 295 |
+
" <td>NaN</td>\n",
|
| 296 |
+
" <td>NaN</td>\n",
|
| 297 |
+
" <td>NaN</td>\n",
|
| 298 |
+
" <td>NaN</td>\n",
|
| 299 |
+
" </tr>\n",
|
| 300 |
+
" <tr>\n",
|
| 301 |
+
" <th>3</th>\n",
|
| 302 |
+
" <td>Denileukin diftitox</td>\n",
|
| 303 |
+
" <td>liquid</td>\n",
|
| 304 |
+
" <td>L01XX</td>\n",
|
| 305 |
+
" <td>L01X</td>\n",
|
| 306 |
+
" <td>L01</td>\n",
|
| 307 |
+
" <td>L</td>\n",
|
| 308 |
+
" <td>-0.301</td>\n",
|
| 309 |
+
" <td>NaN</td>\n",
|
| 310 |
+
" <td>57647.3000</td>\n",
|
| 311 |
+
" <td>5.45</td>\n",
|
| 312 |
+
" <td>...</td>\n",
|
| 313 |
+
" <td>NaN</td>\n",
|
| 314 |
+
" <td>None</td>\n",
|
| 315 |
+
" <td>NaN</td>\n",
|
| 316 |
+
" <td>NaN</td>\n",
|
| 317 |
+
" <td>NaN</td>\n",
|
| 318 |
+
" <td>NaN</td>\n",
|
| 319 |
+
" <td>NaN</td>\n",
|
| 320 |
+
" <td>NaN</td>\n",
|
| 321 |
+
" <td>NaN</td>\n",
|
| 322 |
+
" <td>NaN</td>\n",
|
| 323 |
+
" </tr>\n",
|
| 324 |
+
" <tr>\n",
|
| 325 |
+
" <th>4</th>\n",
|
| 326 |
+
" <td>Etanercept</td>\n",
|
| 327 |
+
" <td>liquid</td>\n",
|
| 328 |
+
" <td>L04AB</td>\n",
|
| 329 |
+
" <td>L04A</td>\n",
|
| 330 |
+
" <td>L04</td>\n",
|
| 331 |
+
" <td>L</td>\n",
|
| 332 |
+
" <td>-0.529</td>\n",
|
| 333 |
+
" <td>NaN</td>\n",
|
| 334 |
+
" <td>51234.9000</td>\n",
|
| 335 |
+
" <td>7.89</td>\n",
|
| 336 |
+
" <td>...</td>\n",
|
| 337 |
+
" <td>NaN</td>\n",
|
| 338 |
+
" <td>None</td>\n",
|
| 339 |
+
" <td>NaN</td>\n",
|
| 340 |
+
" <td>NaN</td>\n",
|
| 341 |
+
" <td>NaN</td>\n",
|
| 342 |
+
" <td>NaN</td>\n",
|
| 343 |
+
" <td>NaN</td>\n",
|
| 344 |
+
" <td>NaN</td>\n",
|
| 345 |
+
" <td>NaN</td>\n",
|
| 346 |
+
" <td>NaN</td>\n",
|
| 347 |
+
" </tr>\n",
|
| 348 |
+
" <tr>\n",
|
| 349 |
+
" <th>...</th>\n",
|
| 350 |
+
" <td>...</td>\n",
|
| 351 |
+
" <td>...</td>\n",
|
| 352 |
+
" <td>...</td>\n",
|
| 353 |
+
" <td>...</td>\n",
|
| 354 |
+
" <td>...</td>\n",
|
| 355 |
+
" <td>...</td>\n",
|
| 356 |
+
" <td>...</td>\n",
|
| 357 |
+
" <td>...</td>\n",
|
| 358 |
+
" <td>...</td>\n",
|
| 359 |
+
" <td>...</td>\n",
|
| 360 |
+
" <td>...</td>\n",
|
| 361 |
+
" <td>...</td>\n",
|
| 362 |
+
" <td>...</td>\n",
|
| 363 |
+
" <td>...</td>\n",
|
| 364 |
+
" <td>...</td>\n",
|
| 365 |
+
" <td>...</td>\n",
|
| 366 |
+
" <td>...</td>\n",
|
| 367 |
+
" <td>...</td>\n",
|
| 368 |
+
" <td>...</td>\n",
|
| 369 |
+
" <td>...</td>\n",
|
| 370 |
+
" <td>...</td>\n",
|
| 371 |
+
" </tr>\n",
|
| 372 |
+
" <tr>\n",
|
| 373 |
+
" <th>15230</th>\n",
|
| 374 |
+
" <td>AUM-601</td>\n",
|
| 375 |
+
" <td>None</td>\n",
|
| 376 |
+
" <td>None</td>\n",
|
| 377 |
+
" <td>None</td>\n",
|
| 378 |
+
" <td>None</td>\n",
|
| 379 |
+
" <td>None</td>\n",
|
| 380 |
+
" <td>NaN</td>\n",
|
| 381 |
+
" <td>NaN</td>\n",
|
| 382 |
+
" <td>NaN</td>\n",
|
| 383 |
+
" <td>NaN</td>\n",
|
| 384 |
+
" <td>...</td>\n",
|
| 385 |
+
" <td>NaN</td>\n",
|
| 386 |
+
" <td>None</td>\n",
|
| 387 |
+
" <td>NaN</td>\n",
|
| 388 |
+
" <td>NaN</td>\n",
|
| 389 |
+
" <td>NaN</td>\n",
|
| 390 |
+
" <td>NaN</td>\n",
|
| 391 |
+
" <td>NaN</td>\n",
|
| 392 |
+
" <td>NaN</td>\n",
|
| 393 |
+
" <td>NaN</td>\n",
|
| 394 |
+
" <td>NaN</td>\n",
|
| 395 |
+
" </tr>\n",
|
| 396 |
+
" <tr>\n",
|
| 397 |
+
" <th>15231</th>\n",
|
| 398 |
+
" <td>FN-1501</td>\n",
|
| 399 |
+
" <td>None</td>\n",
|
| 400 |
+
" <td>None</td>\n",
|
| 401 |
+
" <td>None</td>\n",
|
| 402 |
+
" <td>None</td>\n",
|
| 403 |
+
" <td>None</td>\n",
|
| 404 |
+
" <td>NaN</td>\n",
|
| 405 |
+
" <td>NaN</td>\n",
|
| 406 |
+
" <td>431.5040</td>\n",
|
| 407 |
+
" <td>NaN</td>\n",
|
| 408 |
+
" <td>...</td>\n",
|
| 409 |
+
" <td>NaN</td>\n",
|
| 410 |
+
" <td>None</td>\n",
|
| 411 |
+
" <td>NaN</td>\n",
|
| 412 |
+
" <td>NaN</td>\n",
|
| 413 |
+
" <td>431.218206</td>\n",
|
| 414 |
+
" <td>NaN</td>\n",
|
| 415 |
+
" <td>NaN</td>\n",
|
| 416 |
+
" <td>NaN</td>\n",
|
| 417 |
+
" <td>NaN</td>\n",
|
| 418 |
+
" <td>NaN</td>\n",
|
| 419 |
+
" </tr>\n",
|
| 420 |
+
" <tr>\n",
|
| 421 |
+
" <th>15232</th>\n",
|
| 422 |
+
" <td>Tinengotinib</td>\n",
|
| 423 |
+
" <td>None</td>\n",
|
| 424 |
+
" <td>None</td>\n",
|
| 425 |
+
" <td>None</td>\n",
|
| 426 |
+
" <td>None</td>\n",
|
| 427 |
+
" <td>None</td>\n",
|
| 428 |
+
" <td>NaN</td>\n",
|
| 429 |
+
" <td>NaN</td>\n",
|
| 430 |
+
" <td>394.8600</td>\n",
|
| 431 |
+
" <td>NaN</td>\n",
|
| 432 |
+
" <td>...</td>\n",
|
| 433 |
+
" <td>NaN</td>\n",
|
| 434 |
+
" <td>None</td>\n",
|
| 435 |
+
" <td>NaN</td>\n",
|
| 436 |
+
" <td>NaN</td>\n",
|
| 437 |
+
" <td>394.130887</td>\n",
|
| 438 |
+
" <td>NaN</td>\n",
|
| 439 |
+
" <td>NaN</td>\n",
|
| 440 |
+
" <td>NaN</td>\n",
|
| 441 |
+
" <td>NaN</td>\n",
|
| 442 |
+
" <td>NaN</td>\n",
|
| 443 |
+
" </tr>\n",
|
| 444 |
+
" <tr>\n",
|
| 445 |
+
" <th>15233</th>\n",
|
| 446 |
+
" <td>Lipotecan</td>\n",
|
| 447 |
+
" <td>None</td>\n",
|
| 448 |
+
" <td>None</td>\n",
|
| 449 |
+
" <td>None</td>\n",
|
| 450 |
+
" <td>None</td>\n",
|
| 451 |
+
" <td>None</td>\n",
|
| 452 |
+
" <td>NaN</td>\n",
|
| 453 |
+
" <td>NaN</td>\n",
|
| 454 |
+
" <td>850.7100</td>\n",
|
| 455 |
+
" <td>NaN</td>\n",
|
| 456 |
+
" <td>...</td>\n",
|
| 457 |
+
" <td>NaN</td>\n",
|
| 458 |
+
" <td>None</td>\n",
|
| 459 |
+
" <td>NaN</td>\n",
|
| 460 |
+
" <td>NaN</td>\n",
|
| 461 |
+
" <td>850.183062</td>\n",
|
| 462 |
+
" <td>NaN</td>\n",
|
| 463 |
+
" <td>NaN</td>\n",
|
| 464 |
+
" <td>NaN</td>\n",
|
| 465 |
+
" <td>NaN</td>\n",
|
| 466 |
+
" <td>NaN</td>\n",
|
| 467 |
+
" </tr>\n",
|
| 468 |
+
" <tr>\n",
|
| 469 |
+
" <th>15234</th>\n",
|
| 470 |
+
" <td>Xenon Xe-129</td>\n",
|
| 471 |
+
" <td>None</td>\n",
|
| 472 |
+
" <td>None</td>\n",
|
| 473 |
+
" <td>None</td>\n",
|
| 474 |
+
" <td>None</td>\n",
|
| 475 |
+
" <td>None</td>\n",
|
| 476 |
+
" <td>NaN</td>\n",
|
| 477 |
+
" <td>NaN</td>\n",
|
| 478 |
+
" <td>128.9048</td>\n",
|
| 479 |
+
" <td>NaN</td>\n",
|
| 480 |
+
" <td>...</td>\n",
|
| 481 |
+
" <td>NaN</td>\n",
|
| 482 |
+
" <td>None</td>\n",
|
| 483 |
+
" <td>NaN</td>\n",
|
| 484 |
+
" <td>NaN</td>\n",
|
| 485 |
+
" <td>128.904781</td>\n",
|
| 486 |
+
" <td>NaN</td>\n",
|
| 487 |
+
" <td>NaN</td>\n",
|
| 488 |
+
" <td>NaN</td>\n",
|
| 489 |
+
" <td>NaN</td>\n",
|
| 490 |
+
" <td>NaN</td>\n",
|
| 491 |
+
" </tr>\n",
|
| 492 |
+
" </tbody>\n",
|
| 493 |
+
"</table>\n",
|
| 494 |
+
"<p>15235 rows × 33 columns</p>\n",
|
| 495 |
+
"</div>"
|
| 496 |
+
],
|
| 497 |
+
"text/plain": [
|
| 498 |
+
" name state level4 level3 level2 level1 \\\n",
|
| 499 |
+
"0 Lepirudin solid B01AE B01A B01 B \n",
|
| 500 |
+
"1 Cetuximab liquid L01FE L01F L01 L \n",
|
| 501 |
+
"2 Dornase alfa liquid R05CB R05C R05 R \n",
|
| 502 |
+
"3 Denileukin diftitox liquid L01XX L01X L01 L \n",
|
| 503 |
+
"4 Etanercept liquid L04AB L04A L04 L \n",
|
| 504 |
+
"... ... ... ... ... ... ... \n",
|
| 505 |
+
"15230 AUM-601 None None None None None \n",
|
| 506 |
+
"15231 FN-1501 None None None None None \n",
|
| 507 |
+
"15232 Tinengotinib None None None None None \n",
|
| 508 |
+
"15233 Lipotecan None None None None None \n",
|
| 509 |
+
"15234 Xenon Xe-129 None None None None None \n",
|
| 510 |
+
"\n",
|
| 511 |
+
" Hydrophobicity Boiling Point Molecular Weight Isoelectric Point \\\n",
|
| 512 |
+
"0 NaN NaN NaN NaN \n",
|
| 513 |
+
"1 -0.413 NaN 145781.6000 8.48 \n",
|
| 514 |
+
"2 -0.083 NaN 29253.9000 4.58 \n",
|
| 515 |
+
"3 -0.301 NaN 57647.3000 5.45 \n",
|
| 516 |
+
"4 -0.529 NaN 51234.9000 7.89 \n",
|
| 517 |
+
"... ... ... ... ... \n",
|
| 518 |
+
"15230 NaN NaN NaN NaN \n",
|
| 519 |
+
"15231 NaN NaN 431.5040 NaN \n",
|
| 520 |
+
"15232 NaN NaN 394.8600 NaN \n",
|
| 521 |
+
"15233 NaN NaN 850.7100 NaN \n",
|
| 522 |
+
"15234 NaN NaN 128.9048 NaN \n",
|
| 523 |
+
"\n",
|
| 524 |
+
" ... Polar Surface Area (PSA) Veber's Rule pKa (strongest basic) \\\n",
|
| 525 |
+
"0 ... NaN None NaN \n",
|
| 526 |
+
"1 ... NaN None NaN \n",
|
| 527 |
+
"2 ... NaN None NaN \n",
|
| 528 |
+
"3 ... NaN None NaN \n",
|
| 529 |
+
"4 ... NaN None NaN \n",
|
| 530 |
+
"... ... ... ... ... \n",
|
| 531 |
+
"15230 ... NaN None NaN \n",
|
| 532 |
+
"15231 ... NaN None NaN \n",
|
| 533 |
+
"15232 ... NaN None NaN \n",
|
| 534 |
+
"15233 ... NaN None NaN \n",
|
| 535 |
+
"15234 ... NaN None NaN \n",
|
| 536 |
+
"\n",
|
| 537 |
+
" Ghose Filter Monoisotopic Weight MDDR-Like Rule Polarizability \\\n",
|
| 538 |
+
"0 NaN NaN NaN NaN \n",
|
| 539 |
+
"1 NaN NaN NaN NaN \n",
|
| 540 |
+
"2 NaN NaN NaN NaN \n",
|
| 541 |
+
"3 NaN NaN NaN NaN \n",
|
| 542 |
+
"4 NaN NaN NaN NaN \n",
|
| 543 |
+
"... ... ... ... ... \n",
|
| 544 |
+
"15230 NaN NaN NaN NaN \n",
|
| 545 |
+
"15231 NaN 431.218206 NaN NaN \n",
|
| 546 |
+
"15232 NaN 394.130887 NaN NaN \n",
|
| 547 |
+
"15233 NaN 850.183062 NaN NaN \n",
|
| 548 |
+
"15234 NaN 128.904781 NaN NaN \n",
|
| 549 |
+
"\n",
|
| 550 |
+
" H Bond Acceptor Count Physiological Charge Rule of Five \n",
|
| 551 |
+
"0 NaN NaN NaN \n",
|
| 552 |
+
"1 NaN NaN NaN \n",
|
| 553 |
+
"2 NaN NaN NaN \n",
|
| 554 |
+
"3 NaN NaN NaN \n",
|
| 555 |
+
"4 NaN NaN NaN \n",
|
| 556 |
+
"... ... ... ... \n",
|
| 557 |
+
"15230 NaN NaN NaN \n",
|
| 558 |
+
"15231 NaN NaN NaN \n",
|
| 559 |
+
"15232 NaN NaN NaN \n",
|
| 560 |
+
"15233 NaN NaN NaN \n",
|
| 561 |
+
"15234 NaN NaN NaN \n",
|
| 562 |
+
"\n",
|
| 563 |
+
"[15235 rows x 33 columns]"
|
| 564 |
+
]
|
| 565 |
+
},
|
| 566 |
+
"execution_count": 175,
|
| 567 |
+
"metadata": {},
|
| 568 |
+
"output_type": "execute_result"
|
| 569 |
+
}
|
| 570 |
+
],
|
| 571 |
+
"source": [
|
| 572 |
+
"drugbank_df = pandas.DataFrame.from_dict(rows)\n",
|
| 573 |
+
"drugbank_df.to_csv(\"data/full_database.csv\")\n",
|
| 574 |
+
"drugbank_df"
|
| 575 |
+
]
|
| 576 |
+
},
|
| 577 |
+
{
|
| 578 |
+
"cell_type": "code",
|
| 579 |
+
"execution_count": 181,
|
| 580 |
+
"metadata": {},
|
| 581 |
+
"outputs": [],
|
| 582 |
+
"source": [
|
| 583 |
+
"threshold = 10\n",
|
| 584 |
+
"df = drugbank_df.dropna(thresh=drugbank_df.shape[1] - threshold + 1)\n",
|
| 585 |
+
"df = df.dropna(axis=1, thresh=df.shape[0]-1000+1)"
|
| 586 |
+
]
|
| 587 |
+
},
|
| 588 |
+
{
|
| 589 |
+
"cell_type": "code",
|
| 590 |
+
"execution_count": 182,
|
| 591 |
+
"metadata": {},
|
| 592 |
+
"outputs": [
|
| 593 |
+
{
|
| 594 |
+
"data": {
|
| 595 |
+
"text/plain": [
|
| 596 |
+
"name 0\n",
|
| 597 |
+
"state 549\n",
|
| 598 |
+
"level4 35\n",
|
| 599 |
+
"level3 35\n",
|
| 600 |
+
"level2 35\n",
|
| 601 |
+
"level1 35\n",
|
| 602 |
+
"Molecular Weight 0\n",
|
| 603 |
+
"logP 0\n",
|
| 604 |
+
"Water Solubility 5\n",
|
| 605 |
+
"logS 27\n",
|
| 606 |
+
"Bioavailability 0\n",
|
| 607 |
+
"pKa (strongest acidic) 394\n",
|
| 608 |
+
"Refractivity 0\n",
|
| 609 |
+
"Number of Rings 0\n",
|
| 610 |
+
"H Bond Donor Count 0\n",
|
| 611 |
+
"Rotatable Bond Count 0\n",
|
| 612 |
+
"Polar Surface Area (PSA) 0\n",
|
| 613 |
+
"pKa (strongest basic) 110\n",
|
| 614 |
+
"Ghose Filter 0\n",
|
| 615 |
+
"Monoisotopic Weight 0\n",
|
| 616 |
+
"MDDR-Like Rule 0\n",
|
| 617 |
+
"Polarizability 0\n",
|
| 618 |
+
"H Bond Acceptor Count 0\n",
|
| 619 |
+
"Physiological Charge 0\n",
|
| 620 |
+
"Rule of Five 0\n",
|
| 621 |
+
"dtype: int64"
|
| 622 |
+
]
|
| 623 |
+
},
|
| 624 |
+
"execution_count": 182,
|
| 625 |
+
"metadata": {},
|
| 626 |
+
"output_type": "execute_result"
|
| 627 |
+
}
|
| 628 |
+
],
|
| 629 |
+
"source": [
|
| 630 |
+
"df.isna().sum()"
|
| 631 |
+
]
|
| 632 |
+
},
|
| 633 |
+
{
|
| 634 |
+
"cell_type": "code",
|
| 635 |
+
"execution_count": 183,
|
| 636 |
+
"metadata": {},
|
| 637 |
+
"outputs": [],
|
| 638 |
+
"source": [
|
| 639 |
+
"df.to_csv('data/filtered_dataset.csv')"
|
| 640 |
+
]
|
| 641 |
+
},
|
| 642 |
+
{
|
| 643 |
+
"cell_type": "code",
|
| 644 |
+
"execution_count": 184,
|
| 645 |
+
"metadata": {},
|
| 646 |
+
"outputs": [],
|
| 647 |
+
"source": [
|
| 648 |
+
"interactions = {}\n",
|
| 649 |
+
"# get the set of drugs in the filtered df\n",
|
| 650 |
+
"drugs = set(df[\"name\"])\n",
|
| 651 |
+
"\n",
|
| 652 |
+
"for i in range(15235):\n",
|
| 653 |
+
" drug = data['drugbank']['drug'][i]\n",
|
| 654 |
+
" \n",
|
| 655 |
+
" if drug.get(\"name\", None) in drugs:\n",
|
| 656 |
+
" try:\n",
|
| 657 |
+
" interactions[drug['name']] = [x['name'] for x in drug['drug-interactions'][\"drug-interaction\"] if x['name'] in drugs]\n",
|
| 658 |
+
" except:\n",
|
| 659 |
+
" interactions[drug['name']] = []"
|
| 660 |
+
]
|
| 661 |
+
},
|
| 662 |
+
{
|
| 663 |
+
"cell_type": "code",
|
| 664 |
+
"execution_count": 185,
|
| 665 |
+
"metadata": {},
|
| 666 |
+
"outputs": [],
|
| 667 |
+
"source": [
|
| 668 |
+
"json_obj = json.dumps(interactions, indent=4)\n",
|
| 669 |
+
"\n",
|
| 670 |
+
"# output as json\n",
|
| 671 |
+
"with open(\"data/interactions.json\", \"w\") as outfile:\n",
|
| 672 |
+
" outfile.write(json_obj)"
|
| 673 |
+
]
|
| 674 |
+
}
|
| 675 |
+
],
|
| 676 |
+
"metadata": {
|
| 677 |
+
"anaconda-cloud": {},
|
| 678 |
+
"kernelspec": {
|
| 679 |
+
"display_name": "Python [default]",
|
| 680 |
+
"language": "python",
|
| 681 |
+
"name": "python3"
|
| 682 |
+
},
|
| 683 |
+
"language_info": {
|
| 684 |
+
"codemirror_mode": {
|
| 685 |
+
"name": "ipython",
|
| 686 |
+
"version": 3
|
| 687 |
+
},
|
| 688 |
+
"file_extension": ".py",
|
| 689 |
+
"mimetype": "text/x-python",
|
| 690 |
+
"name": "python",
|
| 691 |
+
"nbconvert_exporter": "python",
|
| 692 |
+
"pygments_lexer": "ipython3",
|
| 693 |
+
"version": "3.8.8"
|
| 694 |
+
}
|
| 695 |
+
},
|
| 696 |
+
"nbformat": 4,
|
| 697 |
+
"nbformat_minor": 0
|
| 698 |
+
}
|