bprimal committed on
Commit
c8e9e79
·
1 Parent(s): 857e14e

Upload 13 files

Browse files
.gitattributes CHANGED
@@ -32,3 +32,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
32
  *.zip filter=lfs diff=lfs merge=lfs -text
33
  *.zst filter=lfs diff=lfs merge=lfs -text
34
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
32
  *.zip filter=lfs diff=lfs merge=lfs -text
33
  *.zst filter=lfs diff=lfs merge=lfs -text
34
  *tfevents* filter=lfs diff=lfs merge=lfs -text
35
+ catboost/datasets/interactions.json filter=lfs diff=lfs merge=lfs -text
36
+ catboost/models/catboost_model2.cbm filter=lfs diff=lfs merge=lfs -text
LICENSE ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ MIT License
2
+
3
+ Copyright (c) 2023 BP Rimal
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
README.md ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ # Drug-Drug-Interaction-Classification
2
+ Drug to Drug Interaction Classifier
3
+
4
+ An innovative approach was developed to address a crucial challenge in drug-drug interaction research. While existing state-of-the-art link prediction models rely on prior knowledge of a drug's interactions with other drugs, our solution uses CatBoost to classify potential interactions based solely on the drugs' intrinsic properties.
5
+
6
+ We developed a new method for predicting drug interactions using the CatBoost algorithm that relies solely on intrinsic properties, rather than prior knowledge of a drug's interactions. We achieved a high accuracy of 0.85 and an AUC-ROC score of 0.86. This breakthrough provides a more efficient and cost-effective approach to predicting drug interactions, particularly for new drugs without prior interaction data.
catboost/adjacency_preprocess.ipynb ADDED
File without changes
catboost/datasets/filtered_dataset.csv ADDED
The diff for this file is too large to render. See raw diff
 
catboost/datasets/interactions.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1ef9c2162886244241c98e32cba62fd929f18f453273209c15408545b7c33b5c
3
+ size 32756785
catboost/inference.py ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ from sklearn.model_selection import train_test_split
3
+ from catboost import CatBoostClassifier, Pool
4
+ from sklearn.metrics import roc_auc_score
5
+ from sklearn.metrics import accuracy_score
6
+ from pandas.core.common import random_state
7
+ import numpy as np
8
+
9
+ # load catboost_df
10
+ catboost_df = pd.read_csv('datasets/catboost_df.csv', index_col=0)
11
+ # drop label name_x and name_y
12
+ catboost_df = catboost_df.drop(['name_x', 'name_y'], axis=1)
13
+ # get the categorical and float features
14
+ cat_features = list(catboost_df.select_dtypes(include=['object']).columns)
15
+ float_features = list(catboost_df.select_dtypes(include=['float64']).columns)
16
+
17
+ for feature in float_features:
18
+ # Fill NaN values with the mean of non-missing values in the same column
19
+ mean_value = catboost_df[feature].mean()
20
+ catboost_df[feature].fillna(mean_value, inplace=True)
21
+
22
+ for feature in cat_features:
23
+ catboost_df[feature] = catboost_df[feature].astype(str)
24
+
25
+ # create test and train set
26
+ X, y = catboost_df.drop('interaction', axis=1), catboost_df['interaction']
27
+ X_train, X_test, y_train, y_test = train_test_split(
28
+ X, y, test_size=0.6, random_state=42)
29
+
30
+ inference = CatBoostClassifier()
31
+ inference.load_model("models/catboost_model2.cbm")
32
+
33
+ y_pred = inference.predict_proba(X_test)
34
+ y_pred = y_pred[:, 1]
35
+ y_pred_binary = np.where(y_pred > 0.5, 1, 0)
36
+ print(f"Test AUC_ROC score = {roc_auc_score(y_test, y_pred)}")
37
+ print(f"Accuracy Score= {accuracy_score(y_test, y_pred_binary)}")
catboost/models/catboost_model.cbm ADDED
Binary file (303 kB). View file
 
catboost/models/catboost_model2.cbm ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:61dc8e9631740294617bf90947493b3955b6032e453cd3aa7e9a8f9d28d7f292
3
+ size 1186180
catboost/preprocess_catboost.ipynb ADDED
@@ -0,0 +1,1858 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 2,
6
+ "metadata": {},
7
+ "outputs": [],
8
+ "source": [
9
+ "import pandas as pd"
10
+ ]
11
+ },
12
+ {
13
+ "cell_type": "code",
14
+ "execution_count": 7,
15
+ "metadata": {},
16
+ "outputs": [
17
+ {
18
+ "data": {
19
+ "text/html": [
20
+ "<div>\n",
21
+ "<style scoped>\n",
22
+ " .dataframe tbody tr th:only-of-type {\n",
23
+ " vertical-align: middle;\n",
24
+ " }\n",
25
+ "\n",
26
+ " .dataframe tbody tr th {\n",
27
+ " vertical-align: top;\n",
28
+ " }\n",
29
+ "\n",
30
+ " .dataframe thead th {\n",
31
+ " text-align: right;\n",
32
+ " }\n",
33
+ "</style>\n",
34
+ "<table border=\"1\" class=\"dataframe\">\n",
35
+ " <thead>\n",
36
+ " <tr style=\"text-align: right;\">\n",
37
+ " <th></th>\n",
38
+ " <th>name</th>\n",
39
+ " <th>state</th>\n",
40
+ " <th>level4</th>\n",
41
+ " <th>level3</th>\n",
42
+ " <th>level2</th>\n",
43
+ " <th>level1</th>\n",
44
+ " <th>Molecular Weight</th>\n",
45
+ " <th>logP</th>\n",
46
+ " <th>Water Solubility</th>\n",
47
+ " <th>logS</th>\n",
48
+ " <th>...</th>\n",
49
+ " <th>Rotatable Bond Count</th>\n",
50
+ " <th>Polar Surface Area (PSA)</th>\n",
51
+ " <th>pKa (strongest basic)</th>\n",
52
+ " <th>Ghose Filter</th>\n",
53
+ " <th>Monoisotopic Weight</th>\n",
54
+ " <th>MDDR-Like Rule</th>\n",
55
+ " <th>Polarizability</th>\n",
56
+ " <th>H Bond Acceptor Count</th>\n",
57
+ " <th>Physiological Charge</th>\n",
58
+ " <th>Rule of Five</th>\n",
59
+ " </tr>\n",
60
+ " </thead>\n",
61
+ " <tbody>\n",
62
+ " <tr>\n",
63
+ " <th>5</th>\n",
64
+ " <td>Bivalirudin</td>\n",
65
+ " <td>solid</td>\n",
66
+ " <td>B01AE</td>\n",
67
+ " <td>B01A</td>\n",
68
+ " <td>B01</td>\n",
69
+ " <td>B</td>\n",
70
+ " <td>2180.2853</td>\n",
71
+ " <td>-14.00</td>\n",
72
+ " <td>0.04640</td>\n",
73
+ " <td>-4.7</td>\n",
74
+ " <td>...</td>\n",
75
+ " <td>66.0</td>\n",
76
+ " <td>901.57</td>\n",
77
+ " <td>11.88</td>\n",
78
+ " <td>0.0</td>\n",
79
+ " <td>2178.985813</td>\n",
80
+ " <td>1.0</td>\n",
81
+ " <td>218.54</td>\n",
82
+ " <td>37.0</td>\n",
83
+ " <td>-4.0</td>\n",
84
+ " <td>0.0</td>\n",
85
+ " </tr>\n",
86
+ " <tr>\n",
87
+ " <th>6</th>\n",
88
+ " <td>Leuprolide</td>\n",
89
+ " <td>solid</td>\n",
90
+ " <td>L02AE</td>\n",
91
+ " <td>L02A</td>\n",
92
+ " <td>L02</td>\n",
93
+ " <td>L</td>\n",
94
+ " <td>1209.3983</td>\n",
95
+ " <td>-2.40</td>\n",
96
+ " <td>0.03380</td>\n",
97
+ " <td>-4.6</td>\n",
98
+ " <td>...</td>\n",
99
+ " <td>32.0</td>\n",
100
+ " <td>429.04</td>\n",
101
+ " <td>11.92</td>\n",
102
+ " <td>0.0</td>\n",
103
+ " <td>1208.645462</td>\n",
104
+ " <td>1.0</td>\n",
105
+ " <td>125.24</td>\n",
106
+ " <td>16.0</td>\n",
107
+ " <td>1.0</td>\n",
108
+ " <td>0.0</td>\n",
109
+ " </tr>\n",
110
+ " <tr>\n",
111
+ " <th>13</th>\n",
112
+ " <td>Goserelin</td>\n",
113
+ " <td>solid</td>\n",
114
+ " <td>L02AE</td>\n",
115
+ " <td>L02A</td>\n",
116
+ " <td>L02</td>\n",
117
+ " <td>L</td>\n",
118
+ " <td>1269.4105</td>\n",
119
+ " <td>-5.10</td>\n",
120
+ " <td>0.02830</td>\n",
121
+ " <td>-4.6</td>\n",
122
+ " <td>...</td>\n",
123
+ " <td>33.0</td>\n",
124
+ " <td>495.89</td>\n",
125
+ " <td>10.91</td>\n",
126
+ " <td>0.0</td>\n",
127
+ " <td>1268.641439</td>\n",
128
+ " <td>1.0</td>\n",
129
+ " <td>130.74</td>\n",
130
+ " <td>18.0</td>\n",
131
+ " <td>1.0</td>\n",
132
+ " <td>0.0</td>\n",
133
+ " </tr>\n",
134
+ " <tr>\n",
135
+ " <th>25</th>\n",
136
+ " <td>Gramicidin D</td>\n",
137
+ " <td>liquid</td>\n",
138
+ " <td>R02AB</td>\n",
139
+ " <td>R02A</td>\n",
140
+ " <td>R02</td>\n",
141
+ " <td>R</td>\n",
142
+ " <td>1811.2530</td>\n",
143
+ " <td>5.96</td>\n",
144
+ " <td>0.00390</td>\n",
145
+ " <td>-5.7</td>\n",
146
+ " <td>...</td>\n",
147
+ " <td>50.0</td>\n",
148
+ " <td>519.89</td>\n",
149
+ " <td>NaN</td>\n",
150
+ " <td>0.0</td>\n",
151
+ " <td>1810.033419</td>\n",
152
+ " <td>1.0</td>\n",
153
+ " <td>194.73</td>\n",
154
+ " <td>16.0</td>\n",
155
+ " <td>0.0</td>\n",
156
+ " <td>0.0</td>\n",
157
+ " </tr>\n",
158
+ " <tr>\n",
159
+ " <th>33</th>\n",
160
+ " <td>Desmopressin</td>\n",
161
+ " <td>solid</td>\n",
162
+ " <td>H01BA</td>\n",
163
+ " <td>H01B</td>\n",
164
+ " <td>H01</td>\n",
165
+ " <td>H</td>\n",
166
+ " <td>1069.2200</td>\n",
167
+ " <td>-6.10</td>\n",
168
+ " <td>0.11000</td>\n",
169
+ " <td>-4.0</td>\n",
170
+ " <td>...</td>\n",
171
+ " <td>19.0</td>\n",
172
+ " <td>435.41</td>\n",
173
+ " <td>11.77</td>\n",
174
+ " <td>0.0</td>\n",
175
+ " <td>1068.426956</td>\n",
176
+ " <td>1.0</td>\n",
177
+ " <td>104.78</td>\n",
178
+ " <td>15.0</td>\n",
179
+ " <td>1.0</td>\n",
180
+ " <td>0.0</td>\n",
181
+ " </tr>\n",
182
+ " <tr>\n",
183
+ " <th>47</th>\n",
184
+ " <td>Cetrorelix</td>\n",
185
+ " <td>solid</td>\n",
186
+ " <td>H01CC</td>\n",
187
+ " <td>H01C</td>\n",
188
+ " <td>H01</td>\n",
189
+ " <td>H</td>\n",
190
+ " <td>1431.0380</td>\n",
191
+ " <td>-1.70</td>\n",
192
+ " <td>0.00694</td>\n",
193
+ " <td>-5.3</td>\n",
194
+ " <td>...</td>\n",
195
+ " <td>38.0</td>\n",
196
+ " <td>495.67</td>\n",
197
+ " <td>11.79</td>\n",
198
+ " <td>0.0</td>\n",
199
+ " <td>1429.669818</td>\n",
200
+ " <td>1.0</td>\n",
201
+ " <td>148.93</td>\n",
202
+ " <td>18.0</td>\n",
203
+ " <td>1.0</td>\n",
204
+ " <td>0.0</td>\n",
205
+ " </tr>\n",
206
+ " <tr>\n",
207
+ " <th>74</th>\n",
208
+ " <td>Daptomycin</td>\n",
209
+ " <td>solid</td>\n",
210
+ " <td>J01XX</td>\n",
211
+ " <td>J01X</td>\n",
212
+ " <td>J01</td>\n",
213
+ " <td>J</td>\n",
214
+ " <td>1620.6930</td>\n",
215
+ " <td>-9.40</td>\n",
216
+ " <td>0.01730</td>\n",
217
+ " <td>-5.0</td>\n",
218
+ " <td>...</td>\n",
219
+ " <td>35.0</td>\n",
220
+ " <td>702.02</td>\n",
221
+ " <td>9.59</td>\n",
222
+ " <td>0.0</td>\n",
223
+ " <td>1619.710366</td>\n",
224
+ " <td>1.0</td>\n",
225
+ " <td>158.96</td>\n",
226
+ " <td>27.0</td>\n",
227
+ " <td>-3.0</td>\n",
228
+ " <td>0.0</td>\n",
229
+ " </tr>\n",
230
+ " <tr>\n",
231
+ " <th>97</th>\n",
232
+ " <td>Abarelix</td>\n",
233
+ " <td>solid</td>\n",
234
+ " <td>L02BX</td>\n",
235
+ " <td>L02B</td>\n",
236
+ " <td>L02</td>\n",
237
+ " <td>L</td>\n",
238
+ " <td>1416.0900</td>\n",
239
+ " <td>-0.46</td>\n",
240
+ " <td>0.00371</td>\n",
241
+ " <td>-5.6</td>\n",
242
+ " <td>...</td>\n",
243
+ " <td>38.0</td>\n",
244
+ " <td>424.98</td>\n",
245
+ " <td>10.66</td>\n",
246
+ " <td>0.0</td>\n",
247
+ " <td>1414.684072</td>\n",
248
+ " <td>1.0</td>\n",
249
+ " <td>149.31</td>\n",
250
+ " <td>16.0</td>\n",
251
+ " <td>1.0</td>\n",
252
+ " <td>0.0</td>\n",
253
+ " </tr>\n",
254
+ " <tr>\n",
255
+ " <th>105</th>\n",
256
+ " <td>Pyridoxal phosphate</td>\n",
257
+ " <td>solid</td>\n",
258
+ " <td>A11HA</td>\n",
259
+ " <td>A11H</td>\n",
260
+ " <td>A11</td>\n",
261
+ " <td>A</td>\n",
262
+ " <td>247.1419</td>\n",
263
+ " <td>-2.10</td>\n",
264
+ " <td>5.70000</td>\n",
265
+ " <td>-1.6</td>\n",
266
+ " <td>...</td>\n",
267
+ " <td>4.0</td>\n",
268
+ " <td>116.95</td>\n",
269
+ " <td>4.11</td>\n",
270
+ " <td>0.0</td>\n",
271
+ " <td>247.024574</td>\n",
272
+ " <td>0.0</td>\n",
273
+ " <td>20.90</td>\n",
274
+ " <td>6.0</td>\n",
275
+ " <td>-2.0</td>\n",
276
+ " <td>1.0</td>\n",
277
+ " </tr>\n",
278
+ " <tr>\n",
279
+ " <th>106</th>\n",
280
+ " <td>Cyanocobalamin</td>\n",
281
+ " <td>solid</td>\n",
282
+ " <td>B03BA</td>\n",
283
+ " <td>B03B</td>\n",
284
+ " <td>B03</td>\n",
285
+ " <td>B</td>\n",
286
+ " <td>1355.3652</td>\n",
287
+ " <td>-3.20</td>\n",
288
+ " <td>0.02020</td>\n",
289
+ " <td>-4.8</td>\n",
290
+ " <td>...</td>\n",
291
+ " <td>27.0</td>\n",
292
+ " <td>477.85</td>\n",
293
+ " <td>8.68</td>\n",
294
+ " <td>0.0</td>\n",
295
+ " <td>1354.567405</td>\n",
296
+ " <td>1.0</td>\n",
297
+ " <td>138.79</td>\n",
298
+ " <td>18.0</td>\n",
299
+ " <td>3.0</td>\n",
300
+ " <td>0.0</td>\n",
301
+ " </tr>\n",
302
+ " </tbody>\n",
303
+ "</table>\n",
304
+ "<p>10 rows × 25 columns</p>\n",
305
+ "</div>"
306
+ ],
307
+ "text/plain": [
308
+ " name state level4 level3 level2 level1 \\\n",
309
+ "5 Bivalirudin solid B01AE B01A B01 B \n",
310
+ "6 Leuprolide solid L02AE L02A L02 L \n",
311
+ "13 Goserelin solid L02AE L02A L02 L \n",
312
+ "25 Gramicidin D liquid R02AB R02A R02 R \n",
313
+ "33 Desmopressin solid H01BA H01B H01 H \n",
314
+ "47 Cetrorelix solid H01CC H01C H01 H \n",
315
+ "74 Daptomycin solid J01XX J01X J01 J \n",
316
+ "97 Abarelix solid L02BX L02B L02 L \n",
317
+ "105 Pyridoxal phosphate solid A11HA A11H A11 A \n",
318
+ "106 Cyanocobalamin solid B03BA B03B B03 B \n",
319
+ "\n",
320
+ " Molecular Weight logP Water Solubility logS ... \\\n",
321
+ "5 2180.2853 -14.00 0.04640 -4.7 ... \n",
322
+ "6 1209.3983 -2.40 0.03380 -4.6 ... \n",
323
+ "13 1269.4105 -5.10 0.02830 -4.6 ... \n",
324
+ "25 1811.2530 5.96 0.00390 -5.7 ... \n",
325
+ "33 1069.2200 -6.10 0.11000 -4.0 ... \n",
326
+ "47 1431.0380 -1.70 0.00694 -5.3 ... \n",
327
+ "74 1620.6930 -9.40 0.01730 -5.0 ... \n",
328
+ "97 1416.0900 -0.46 0.00371 -5.6 ... \n",
329
+ "105 247.1419 -2.10 5.70000 -1.6 ... \n",
330
+ "106 1355.3652 -3.20 0.02020 -4.8 ... \n",
331
+ "\n",
332
+ " Rotatable Bond Count Polar Surface Area (PSA) pKa (strongest basic) \\\n",
333
+ "5 66.0 901.57 11.88 \n",
334
+ "6 32.0 429.04 11.92 \n",
335
+ "13 33.0 495.89 10.91 \n",
336
+ "25 50.0 519.89 NaN \n",
337
+ "33 19.0 435.41 11.77 \n",
338
+ "47 38.0 495.67 11.79 \n",
339
+ "74 35.0 702.02 9.59 \n",
340
+ "97 38.0 424.98 10.66 \n",
341
+ "105 4.0 116.95 4.11 \n",
342
+ "106 27.0 477.85 8.68 \n",
343
+ "\n",
344
+ " Ghose Filter Monoisotopic Weight MDDR-Like Rule Polarizability \\\n",
345
+ "5 0.0 2178.985813 1.0 218.54 \n",
346
+ "6 0.0 1208.645462 1.0 125.24 \n",
347
+ "13 0.0 1268.641439 1.0 130.74 \n",
348
+ "25 0.0 1810.033419 1.0 194.73 \n",
349
+ "33 0.0 1068.426956 1.0 104.78 \n",
350
+ "47 0.0 1429.669818 1.0 148.93 \n",
351
+ "74 0.0 1619.710366 1.0 158.96 \n",
352
+ "97 0.0 1414.684072 1.0 149.31 \n",
353
+ "105 0.0 247.024574 0.0 20.90 \n",
354
+ "106 0.0 1354.567405 1.0 138.79 \n",
355
+ "\n",
356
+ " H Bond Acceptor Count Physiological Charge Rule of Five \n",
357
+ "5 37.0 -4.0 0.0 \n",
358
+ "6 16.0 1.0 0.0 \n",
359
+ "13 18.0 1.0 0.0 \n",
360
+ "25 16.0 0.0 0.0 \n",
361
+ "33 15.0 1.0 0.0 \n",
362
+ "47 18.0 1.0 0.0 \n",
363
+ "74 27.0 -3.0 0.0 \n",
364
+ "97 16.0 1.0 0.0 \n",
365
+ "105 6.0 -2.0 1.0 \n",
366
+ "106 18.0 3.0 0.0 \n",
367
+ "\n",
368
+ "[10 rows x 25 columns]"
369
+ ]
370
+ },
371
+ "execution_count": 7,
372
+ "metadata": {},
373
+ "output_type": "execute_result"
374
+ }
375
+ ],
376
+ "source": [
377
+ "# drop the first column\n",
378
+ "df = pd.read_csv('datasets/filtered_dataset.csv', index_col=0)\n",
379
+ "df.head(10)"
380
+ ]
381
+ },
382
+ {
383
+ "cell_type": "code",
384
+ "execution_count": 8,
385
+ "metadata": {},
386
+ "outputs": [
387
+ {
388
+ "data": {
389
+ "text/html": [
390
+ "<div>\n",
391
+ "<style scoped>\n",
392
+ " .dataframe tbody tr th:only-of-type {\n",
393
+ " vertical-align: middle;\n",
394
+ " }\n",
395
+ "\n",
396
+ " .dataframe tbody tr th {\n",
397
+ " vertical-align: top;\n",
398
+ " }\n",
399
+ "\n",
400
+ " .dataframe thead th {\n",
401
+ " text-align: right;\n",
402
+ " }\n",
403
+ "</style>\n",
404
+ "<table border=\"1\" class=\"dataframe\">\n",
405
+ " <thead>\n",
406
+ " <tr style=\"text-align: right;\">\n",
407
+ " <th></th>\n",
408
+ " <th>name_x</th>\n",
409
+ " <th>state_x</th>\n",
410
+ " <th>level4_x</th>\n",
411
+ " <th>level3_x</th>\n",
412
+ " <th>level2_x</th>\n",
413
+ " <th>level1_x</th>\n",
414
+ " <th>Molecular Weight_x</th>\n",
415
+ " <th>logP_x</th>\n",
416
+ " <th>Water Solubility_x</th>\n",
417
+ " <th>logS_x</th>\n",
418
+ " <th>...</th>\n",
419
+ " <th>Rotatable Bond Count_y</th>\n",
420
+ " <th>Polar Surface Area (PSA)_y</th>\n",
421
+ " <th>pKa (strongest basic)_y</th>\n",
422
+ " <th>Ghose Filter_y</th>\n",
423
+ " <th>Monoisotopic Weight_y</th>\n",
424
+ " <th>MDDR-Like Rule_y</th>\n",
425
+ " <th>Polarizability_y</th>\n",
426
+ " <th>H Bond Acceptor Count_y</th>\n",
427
+ " <th>Physiological Charge_y</th>\n",
428
+ " <th>Rule of Five_y</th>\n",
429
+ " </tr>\n",
430
+ " </thead>\n",
431
+ " <tbody>\n",
432
+ " <tr>\n",
433
+ " <th>0</th>\n",
434
+ " <td>Bivalirudin</td>\n",
435
+ " <td>solid</td>\n",
436
+ " <td>B01AE</td>\n",
437
+ " <td>B01A</td>\n",
438
+ " <td>B01</td>\n",
439
+ " <td>B</td>\n",
440
+ " <td>2180.2853</td>\n",
441
+ " <td>-14.0</td>\n",
442
+ " <td>0.0464</td>\n",
443
+ " <td>-4.7</td>\n",
444
+ " <td>...</td>\n",
445
+ " <td>66.0</td>\n",
446
+ " <td>901.57</td>\n",
447
+ " <td>11.88</td>\n",
448
+ " <td>0.0</td>\n",
449
+ " <td>2178.985813</td>\n",
450
+ " <td>1.0</td>\n",
451
+ " <td>218.54</td>\n",
452
+ " <td>37.0</td>\n",
453
+ " <td>-4.0</td>\n",
454
+ " <td>0.0</td>\n",
455
+ " </tr>\n",
456
+ " <tr>\n",
457
+ " <th>1</th>\n",
458
+ " <td>Bivalirudin</td>\n",
459
+ " <td>solid</td>\n",
460
+ " <td>B01AE</td>\n",
461
+ " <td>B01A</td>\n",
462
+ " <td>B01</td>\n",
463
+ " <td>B</td>\n",
464
+ " <td>2180.2853</td>\n",
465
+ " <td>-14.0</td>\n",
466
+ " <td>0.0464</td>\n",
467
+ " <td>-4.7</td>\n",
468
+ " <td>...</td>\n",
469
+ " <td>32.0</td>\n",
470
+ " <td>429.04</td>\n",
471
+ " <td>11.92</td>\n",
472
+ " <td>0.0</td>\n",
473
+ " <td>1208.645462</td>\n",
474
+ " <td>1.0</td>\n",
475
+ " <td>125.24</td>\n",
476
+ " <td>16.0</td>\n",
477
+ " <td>1.0</td>\n",
478
+ " <td>0.0</td>\n",
479
+ " </tr>\n",
480
+ " <tr>\n",
481
+ " <th>2</th>\n",
482
+ " <td>Bivalirudin</td>\n",
483
+ " <td>solid</td>\n",
484
+ " <td>B01AE</td>\n",
485
+ " <td>B01A</td>\n",
486
+ " <td>B01</td>\n",
487
+ " <td>B</td>\n",
488
+ " <td>2180.2853</td>\n",
489
+ " <td>-14.0</td>\n",
490
+ " <td>0.0464</td>\n",
491
+ " <td>-4.7</td>\n",
492
+ " <td>...</td>\n",
493
+ " <td>33.0</td>\n",
494
+ " <td>495.89</td>\n",
495
+ " <td>10.91</td>\n",
496
+ " <td>0.0</td>\n",
497
+ " <td>1268.641439</td>\n",
498
+ " <td>1.0</td>\n",
499
+ " <td>130.74</td>\n",
500
+ " <td>18.0</td>\n",
501
+ " <td>1.0</td>\n",
502
+ " <td>0.0</td>\n",
503
+ " </tr>\n",
504
+ " <tr>\n",
505
+ " <th>3</th>\n",
506
+ " <td>Bivalirudin</td>\n",
507
+ " <td>solid</td>\n",
508
+ " <td>B01AE</td>\n",
509
+ " <td>B01A</td>\n",
510
+ " <td>B01</td>\n",
511
+ " <td>B</td>\n",
512
+ " <td>2180.2853</td>\n",
513
+ " <td>-14.0</td>\n",
514
+ " <td>0.0464</td>\n",
515
+ " <td>-4.7</td>\n",
516
+ " <td>...</td>\n",
517
+ " <td>50.0</td>\n",
518
+ " <td>519.89</td>\n",
519
+ " <td>NaN</td>\n",
520
+ " <td>0.0</td>\n",
521
+ " <td>1810.033419</td>\n",
522
+ " <td>1.0</td>\n",
523
+ " <td>194.73</td>\n",
524
+ " <td>16.0</td>\n",
525
+ " <td>0.0</td>\n",
526
+ " <td>0.0</td>\n",
527
+ " </tr>\n",
528
+ " <tr>\n",
529
+ " <th>4</th>\n",
530
+ " <td>Bivalirudin</td>\n",
531
+ " <td>solid</td>\n",
532
+ " <td>B01AE</td>\n",
533
+ " <td>B01A</td>\n",
534
+ " <td>B01</td>\n",
535
+ " <td>B</td>\n",
536
+ " <td>2180.2853</td>\n",
537
+ " <td>-14.0</td>\n",
538
+ " <td>0.0464</td>\n",
539
+ " <td>-4.7</td>\n",
540
+ " <td>...</td>\n",
541
+ " <td>19.0</td>\n",
542
+ " <td>435.41</td>\n",
543
+ " <td>11.77</td>\n",
544
+ " <td>0.0</td>\n",
545
+ " <td>1068.426956</td>\n",
546
+ " <td>1.0</td>\n",
547
+ " <td>104.78</td>\n",
548
+ " <td>15.0</td>\n",
549
+ " <td>1.0</td>\n",
550
+ " <td>0.0</td>\n",
551
+ " </tr>\n",
552
+ " <tr>\n",
553
+ " <th>...</th>\n",
554
+ " <td>...</td>\n",
555
+ " <td>...</td>\n",
556
+ " <td>...</td>\n",
557
+ " <td>...</td>\n",
558
+ " <td>...</td>\n",
559
+ " <td>...</td>\n",
560
+ " <td>...</td>\n",
561
+ " <td>...</td>\n",
562
+ " <td>...</td>\n",
563
+ " <td>...</td>\n",
564
+ " <td>...</td>\n",
565
+ " <td>...</td>\n",
566
+ " <td>...</td>\n",
567
+ " <td>...</td>\n",
568
+ " <td>...</td>\n",
569
+ " <td>...</td>\n",
570
+ " <td>...</td>\n",
571
+ " <td>...</td>\n",
572
+ " <td>...</td>\n",
573
+ " <td>...</td>\n",
574
+ " <td>...</td>\n",
575
+ " </tr>\n",
576
+ " <tr>\n",
577
+ " <th>6916895</th>\n",
578
+ " <td>Methionine C-11</td>\n",
579
+ " <td>NaN</td>\n",
580
+ " <td>V09IX</td>\n",
581
+ " <td>V09I</td>\n",
582
+ " <td>V09</td>\n",
583
+ " <td>V</td>\n",
584
+ " <td>148.2100</td>\n",
585
+ " <td>-2.2</td>\n",
586
+ " <td>23.9000</td>\n",
587
+ " <td>-0.8</td>\n",
588
+ " <td>...</td>\n",
589
+ " <td>7.0</td>\n",
590
+ " <td>104.82</td>\n",
591
+ " <td>4.11</td>\n",
592
+ " <td>0.0</td>\n",
593
+ " <td>452.196074</td>\n",
594
+ " <td>1.0</td>\n",
595
+ " <td>49.55</td>\n",
596
+ " <td>6.0</td>\n",
597
+ " <td>0.0</td>\n",
598
+ " <td>1.0</td>\n",
599
+ " </tr>\n",
600
+ " <tr>\n",
601
+ " <th>6916896</th>\n",
602
+ " <td>Methionine C-11</td>\n",
603
+ " <td>NaN</td>\n",
604
+ " <td>V09IX</td>\n",
605
+ " <td>V09I</td>\n",
606
+ " <td>V09</td>\n",
607
+ " <td>V</td>\n",
608
+ " <td>148.2100</td>\n",
609
+ " <td>-2.2</td>\n",
610
+ " <td>23.9000</td>\n",
611
+ " <td>-0.8</td>\n",
612
+ " <td>...</td>\n",
613
+ " <td>9.0</td>\n",
614
+ " <td>108.74</td>\n",
615
+ " <td>6.27</td>\n",
616
+ " <td>0.0</td>\n",
617
+ " <td>497.165428</td>\n",
618
+ " <td>1.0</td>\n",
619
+ " <td>53.39</td>\n",
620
+ " <td>6.0</td>\n",
621
+ " <td>0.0</td>\n",
622
+ " <td>1.0</td>\n",
623
+ " </tr>\n",
624
+ " <tr>\n",
625
+ " <th>6916897</th>\n",
626
+ " <td>Methionine C-11</td>\n",
627
+ " <td>NaN</td>\n",
628
+ " <td>V09IX</td>\n",
629
+ " <td>V09I</td>\n",
630
+ " <td>V09</td>\n",
631
+ " <td>V</td>\n",
632
+ " <td>148.2100</td>\n",
633
+ " <td>-2.2</td>\n",
634
+ " <td>23.9000</td>\n",
635
+ " <td>-0.8</td>\n",
636
+ " <td>...</td>\n",
637
+ " <td>3.0</td>\n",
638
+ " <td>99.76</td>\n",
639
+ " <td>9.80</td>\n",
640
+ " <td>1.0</td>\n",
641
+ " <td>404.109625</td>\n",
642
+ " <td>0.0</td>\n",
643
+ " <td>37.18</td>\n",
644
+ " <td>7.0</td>\n",
645
+ " <td>0.0</td>\n",
646
+ " <td>1.0</td>\n",
647
+ " </tr>\n",
648
+ " <tr>\n",
649
+ " <th>6916898</th>\n",
650
+ " <td>Methionine C-11</td>\n",
651
+ " <td>NaN</td>\n",
652
+ " <td>V09IX</td>\n",
653
+ " <td>V09I</td>\n",
654
+ " <td>V09</td>\n",
655
+ " <td>V</td>\n",
656
+ " <td>148.2100</td>\n",
657
+ " <td>-2.2</td>\n",
658
+ " <td>23.9000</td>\n",
659
+ " <td>-0.8</td>\n",
660
+ " <td>...</td>\n",
661
+ " <td>6.0</td>\n",
662
+ " <td>114.40</td>\n",
663
+ " <td>-3.50</td>\n",
664
+ " <td>0.0</td>\n",
665
+ " <td>508.055206</td>\n",
666
+ " <td>1.0</td>\n",
667
+ " <td>45.39</td>\n",
668
+ " <td>7.0</td>\n",
669
+ " <td>-1.0</td>\n",
670
+ " <td>0.0</td>\n",
671
+ " </tr>\n",
672
+ " <tr>\n",
673
+ " <th>6916899</th>\n",
674
+ " <td>Methionine C-11</td>\n",
675
+ " <td>NaN</td>\n",
676
+ " <td>V09IX</td>\n",
677
+ " <td>V09I</td>\n",
678
+ " <td>V09</td>\n",
679
+ " <td>V</td>\n",
680
+ " <td>148.2100</td>\n",
681
+ " <td>-2.2</td>\n",
682
+ " <td>23.9000</td>\n",
683
+ " <td>-0.8</td>\n",
684
+ " <td>...</td>\n",
685
+ " <td>4.0</td>\n",
686
+ " <td>63.32</td>\n",
687
+ " <td>9.50</td>\n",
688
+ " <td>0.0</td>\n",
689
+ " <td>148.062484</td>\n",
690
+ " <td>0.0</td>\n",
691
+ " <td>15.54</td>\n",
692
+ " <td>3.0</td>\n",
693
+ " <td>0.0</td>\n",
694
+ " <td>1.0</td>\n",
695
+ " </tr>\n",
696
+ " </tbody>\n",
697
+ "</table>\n",
698
+ "<p>6916900 rows × 50 columns</p>\n",
699
+ "</div>"
700
+ ],
701
+ "text/plain": [
702
+ " name_x state_x level4_x level3_x level2_x level1_x \\\n",
703
+ "0 Bivalirudin solid B01AE B01A B01 B \n",
704
+ "1 Bivalirudin solid B01AE B01A B01 B \n",
705
+ "2 Bivalirudin solid B01AE B01A B01 B \n",
706
+ "3 Bivalirudin solid B01AE B01A B01 B \n",
707
+ "4 Bivalirudin solid B01AE B01A B01 B \n",
708
+ "... ... ... ... ... ... ... \n",
709
+ "6916895 Methionine C-11 NaN V09IX V09I V09 V \n",
710
+ "6916896 Methionine C-11 NaN V09IX V09I V09 V \n",
711
+ "6916897 Methionine C-11 NaN V09IX V09I V09 V \n",
712
+ "6916898 Methionine C-11 NaN V09IX V09I V09 V \n",
713
+ "6916899 Methionine C-11 NaN V09IX V09I V09 V \n",
714
+ "\n",
715
+ " Molecular Weight_x logP_x Water Solubility_x logS_x ... \\\n",
716
+ "0 2180.2853 -14.0 0.0464 -4.7 ... \n",
717
+ "1 2180.2853 -14.0 0.0464 -4.7 ... \n",
718
+ "2 2180.2853 -14.0 0.0464 -4.7 ... \n",
719
+ "3 2180.2853 -14.0 0.0464 -4.7 ... \n",
720
+ "4 2180.2853 -14.0 0.0464 -4.7 ... \n",
721
+ "... ... ... ... ... ... \n",
722
+ "6916895 148.2100 -2.2 23.9000 -0.8 ... \n",
723
+ "6916896 148.2100 -2.2 23.9000 -0.8 ... \n",
724
+ "6916897 148.2100 -2.2 23.9000 -0.8 ... \n",
725
+ "6916898 148.2100 -2.2 23.9000 -0.8 ... \n",
726
+ "6916899 148.2100 -2.2 23.9000 -0.8 ... \n",
727
+ "\n",
728
+ " Rotatable Bond Count_y Polar Surface Area (PSA)_y \\\n",
729
+ "0 66.0 901.57 \n",
730
+ "1 32.0 429.04 \n",
731
+ "2 33.0 495.89 \n",
732
+ "3 50.0 519.89 \n",
733
+ "4 19.0 435.41 \n",
734
+ "... ... ... \n",
735
+ "6916895 7.0 104.82 \n",
736
+ "6916896 9.0 108.74 \n",
737
+ "6916897 3.0 99.76 \n",
738
+ "6916898 6.0 114.40 \n",
739
+ "6916899 4.0 63.32 \n",
740
+ "\n",
741
+ " pKa (strongest basic)_y Ghose Filter_y Monoisotopic Weight_y \\\n",
742
+ "0 11.88 0.0 2178.985813 \n",
743
+ "1 11.92 0.0 1208.645462 \n",
744
+ "2 10.91 0.0 1268.641439 \n",
745
+ "3 NaN 0.0 1810.033419 \n",
746
+ "4 11.77 0.0 1068.426956 \n",
747
+ "... ... ... ... \n",
748
+ "6916895 4.11 0.0 452.196074 \n",
749
+ "6916896 6.27 0.0 497.165428 \n",
750
+ "6916897 9.80 1.0 404.109625 \n",
751
+ "6916898 -3.50 0.0 508.055206 \n",
752
+ "6916899 9.50 0.0 148.062484 \n",
753
+ "\n",
754
+ " MDDR-Like Rule_y Polarizability_y H Bond Acceptor Count_y \\\n",
755
+ "0 1.0 218.54 37.0 \n",
756
+ "1 1.0 125.24 16.0 \n",
757
+ "2 1.0 130.74 18.0 \n",
758
+ "3 1.0 194.73 16.0 \n",
759
+ "4 1.0 104.78 15.0 \n",
760
+ "... ... ... ... \n",
761
+ "6916895 1.0 49.55 6.0 \n",
762
+ "6916896 1.0 53.39 6.0 \n",
763
+ "6916897 0.0 37.18 7.0 \n",
764
+ "6916898 1.0 45.39 7.0 \n",
765
+ "6916899 0.0 15.54 3.0 \n",
766
+ "\n",
767
+ " Physiological Charge_y Rule of Five_y \n",
768
+ "0 -4.0 0.0 \n",
769
+ "1 1.0 0.0 \n",
770
+ "2 1.0 0.0 \n",
771
+ "3 0.0 0.0 \n",
772
+ "4 1.0 0.0 \n",
773
+ "... ... ... \n",
774
+ "6916895 0.0 1.0 \n",
775
+ "6916896 0.0 1.0 \n",
776
+ "6916897 0.0 1.0 \n",
777
+ "6916898 -1.0 0.0 \n",
778
+ "6916899 0.0 1.0 \n",
779
+ "\n",
780
+ "[6916900 rows x 50 columns]"
781
+ ]
782
+ },
783
+ "execution_count": 8,
784
+ "metadata": {},
785
+ "output_type": "execute_result"
786
+ }
787
+ ],
788
+ "source": [
789
+ "# cross two datasets to get all drug pairs\n",
790
+ "df1 = pd.read_csv('datasets/filtered_dataset.csv', index_col=0)\n",
791
+ "df2 = pd.read_csv('datasets/filtered_dataset.csv', index_col=0)\n",
792
+ "\n",
793
+ "df3 = pd.merge(df1, df2, how='cross')\n",
794
+ "df3"
795
+ ]
796
+ },
797
+ {
798
+ "cell_type": "code",
799
+ "execution_count": 9,
800
+ "metadata": {},
801
+ "outputs": [],
802
+ "source": [
803
+ "df3.to_csv('datasets/drug_pairs.csv')"
804
+ ]
805
+ },
806
+ {
807
+ "cell_type": "code",
808
+ "execution_count": 18,
809
+ "metadata": {},
810
+ "outputs": [
811
+ {
812
+ "name": "stdout",
813
+ "output_type": "stream",
814
+ "text": [
815
+ "(6916900, 50) (2630, 25)\n"
816
+ ]
817
+ }
818
+ ],
819
+ "source": [
820
+ "print(df3.shape, df1.shape)"
821
+ ]
822
+ },
823
+ {
824
+ "cell_type": "code",
825
+ "execution_count": null,
826
+ "metadata": {},
827
+ "outputs": [],
828
+ "source": [
829
+ "from itertools import combinations\n",
830
+ "drug_pairs = list(combinations(df['name'], 2))\n",
831
+ "\n",
832
+ "# Create an empty dataframe to store the pairwise combinations and features\n",
833
+ "col1 = [x+\"_d1\" for x in df.columns[1:]]\n",
834
+ "col2 = [x+\"_d2\" for x in df.columns[1:]]\n",
835
+ "\n",
836
+ "df_pairs = pd.DataFrame(columns=['drug1', 'drug2', *col1, *col2])\n",
837
+ "\n",
838
+ "# Iterate through the drug pairs and populate the dataframe\n",
839
+ "for drug1, drug2 in drug_pairs:\n",
840
+ " features_drug1 = df[df['name'] == drug1][[*(df.columns[1:])]].values.flatten()\n",
841
+ " features_drug2 = df[df['name'] == drug2][[*(df.columns[1:])]].values.flatten()\n",
842
+ " row = pd.DataFrame([[drug1, drug2, *features_drug1, *features_drug2]], columns=df_pairs.columns)\n",
843
+ " df_pairs = df_pairs.append(row, ignore_index=True)\n",
844
+ "\n",
845
+ "# Print the resulting pairwise combinations and features dataframe\n",
846
+ "print(df_pairs)\n"
847
+ ]
848
+ },
849
+ {
850
+ "cell_type": "code",
851
+ "execution_count": 1,
852
+ "metadata": {},
853
+ "outputs": [
854
+ {
855
+ "name": "stderr",
856
+ "output_type": "stream",
857
+ "text": [
858
+ "/home/bprimal/.local/lib/python3.8/site-packages/IPython/core/interactiveshell.py:3442: DtypeWarning: Columns (2) have mixed types.Specify dtype option on import or set low_memory=False.\n",
859
+ " exec(code_obj, self.user_global_ns, self.user_ns)\n"
860
+ ]
861
+ },
862
+ {
863
+ "data": {
864
+ "text/html": [
865
+ "<div>\n",
866
+ "<style scoped>\n",
867
+ " .dataframe tbody tr th:only-of-type {\n",
868
+ " vertical-align: middle;\n",
869
+ " }\n",
870
+ "\n",
871
+ " .dataframe tbody tr th {\n",
872
+ " vertical-align: top;\n",
873
+ " }\n",
874
+ "\n",
875
+ " .dataframe thead th {\n",
876
+ " text-align: right;\n",
877
+ " }\n",
878
+ "</style>\n",
879
+ "<table border=\"1\" class=\"dataframe\">\n",
880
+ " <thead>\n",
881
+ " <tr style=\"text-align: right;\">\n",
882
+ " <th></th>\n",
883
+ " <th>name_x</th>\n",
884
+ " <th>state_x</th>\n",
885
+ " <th>level4_x</th>\n",
886
+ " <th>level3_x</th>\n",
887
+ " <th>level2_x</th>\n",
888
+ " <th>level1_x</th>\n",
889
+ " <th>Molecular Weight_x</th>\n",
890
+ " <th>logP_x</th>\n",
891
+ " <th>Water Solubility_x</th>\n",
892
+ " <th>logS_x</th>\n",
893
+ " <th>...</th>\n",
894
+ " <th>Rotatable Bond Count_y</th>\n",
895
+ " <th>Polar Surface Area (PSA)_y</th>\n",
896
+ " <th>pKa (strongest basic)_y</th>\n",
897
+ " <th>Ghose Filter_y</th>\n",
898
+ " <th>Monoisotopic Weight_y</th>\n",
899
+ " <th>MDDR-Like Rule_y</th>\n",
900
+ " <th>Polarizability_y</th>\n",
901
+ " <th>H Bond Acceptor Count_y</th>\n",
902
+ " <th>Physiological Charge_y</th>\n",
903
+ " <th>Rule of Five_y</th>\n",
904
+ " </tr>\n",
905
+ " </thead>\n",
906
+ " <tbody>\n",
907
+ " <tr>\n",
908
+ " <th>0</th>\n",
909
+ " <td>Bivalirudin</td>\n",
910
+ " <td>solid</td>\n",
911
+ " <td>B01AE</td>\n",
912
+ " <td>B01A</td>\n",
913
+ " <td>B01</td>\n",
914
+ " <td>B</td>\n",
915
+ " <td>2180.2853</td>\n",
916
+ " <td>-14.0</td>\n",
917
+ " <td>0.0464</td>\n",
918
+ " <td>-4.7</td>\n",
919
+ " <td>...</td>\n",
920
+ " <td>66.0</td>\n",
921
+ " <td>901.57</td>\n",
922
+ " <td>11.88</td>\n",
923
+ " <td>0.0</td>\n",
924
+ " <td>2178.985813</td>\n",
925
+ " <td>1.0</td>\n",
926
+ " <td>218.54</td>\n",
927
+ " <td>37.0</td>\n",
928
+ " <td>-4.0</td>\n",
929
+ " <td>0.0</td>\n",
930
+ " </tr>\n",
931
+ " <tr>\n",
932
+ " <th>1</th>\n",
933
+ " <td>Bivalirudin</td>\n",
934
+ " <td>solid</td>\n",
935
+ " <td>B01AE</td>\n",
936
+ " <td>B01A</td>\n",
937
+ " <td>B01</td>\n",
938
+ " <td>B</td>\n",
939
+ " <td>2180.2853</td>\n",
940
+ " <td>-14.0</td>\n",
941
+ " <td>0.0464</td>\n",
942
+ " <td>-4.7</td>\n",
943
+ " <td>...</td>\n",
944
+ " <td>32.0</td>\n",
945
+ " <td>429.04</td>\n",
946
+ " <td>11.92</td>\n",
947
+ " <td>0.0</td>\n",
948
+ " <td>1208.645462</td>\n",
949
+ " <td>1.0</td>\n",
950
+ " <td>125.24</td>\n",
951
+ " <td>16.0</td>\n",
952
+ " <td>1.0</td>\n",
953
+ " <td>0.0</td>\n",
954
+ " </tr>\n",
955
+ " <tr>\n",
956
+ " <th>2</th>\n",
957
+ " <td>Bivalirudin</td>\n",
958
+ " <td>solid</td>\n",
959
+ " <td>B01AE</td>\n",
960
+ " <td>B01A</td>\n",
961
+ " <td>B01</td>\n",
962
+ " <td>B</td>\n",
963
+ " <td>2180.2853</td>\n",
964
+ " <td>-14.0</td>\n",
965
+ " <td>0.0464</td>\n",
966
+ " <td>-4.7</td>\n",
967
+ " <td>...</td>\n",
968
+ " <td>33.0</td>\n",
969
+ " <td>495.89</td>\n",
970
+ " <td>10.91</td>\n",
971
+ " <td>0.0</td>\n",
972
+ " <td>1268.641439</td>\n",
973
+ " <td>1.0</td>\n",
974
+ " <td>130.74</td>\n",
975
+ " <td>18.0</td>\n",
976
+ " <td>1.0</td>\n",
977
+ " <td>0.0</td>\n",
978
+ " </tr>\n",
979
+ " <tr>\n",
980
+ " <th>3</th>\n",
981
+ " <td>Bivalirudin</td>\n",
982
+ " <td>solid</td>\n",
983
+ " <td>B01AE</td>\n",
984
+ " <td>B01A</td>\n",
985
+ " <td>B01</td>\n",
986
+ " <td>B</td>\n",
987
+ " <td>2180.2853</td>\n",
988
+ " <td>-14.0</td>\n",
989
+ " <td>0.0464</td>\n",
990
+ " <td>-4.7</td>\n",
991
+ " <td>...</td>\n",
992
+ " <td>50.0</td>\n",
993
+ " <td>519.89</td>\n",
994
+ " <td>NaN</td>\n",
995
+ " <td>0.0</td>\n",
996
+ " <td>1810.033419</td>\n",
997
+ " <td>1.0</td>\n",
998
+ " <td>194.73</td>\n",
999
+ " <td>16.0</td>\n",
1000
+ " <td>0.0</td>\n",
1001
+ " <td>0.0</td>\n",
1002
+ " </tr>\n",
1003
+ " <tr>\n",
1004
+ " <th>4</th>\n",
1005
+ " <td>Bivalirudin</td>\n",
1006
+ " <td>solid</td>\n",
1007
+ " <td>B01AE</td>\n",
1008
+ " <td>B01A</td>\n",
1009
+ " <td>B01</td>\n",
1010
+ " <td>B</td>\n",
1011
+ " <td>2180.2853</td>\n",
1012
+ " <td>-14.0</td>\n",
1013
+ " <td>0.0464</td>\n",
1014
+ " <td>-4.7</td>\n",
1015
+ " <td>...</td>\n",
1016
+ " <td>19.0</td>\n",
1017
+ " <td>435.41</td>\n",
1018
+ " <td>11.77</td>\n",
1019
+ " <td>0.0</td>\n",
1020
+ " <td>1068.426956</td>\n",
1021
+ " <td>1.0</td>\n",
1022
+ " <td>104.78</td>\n",
1023
+ " <td>15.0</td>\n",
1024
+ " <td>1.0</td>\n",
1025
+ " <td>0.0</td>\n",
1026
+ " </tr>\n",
1027
+ " <tr>\n",
1028
+ " <th>5</th>\n",
1029
+ " <td>Bivalirudin</td>\n",
1030
+ " <td>solid</td>\n",
1031
+ " <td>B01AE</td>\n",
1032
+ " <td>B01A</td>\n",
1033
+ " <td>B01</td>\n",
1034
+ " <td>B</td>\n",
1035
+ " <td>2180.2853</td>\n",
1036
+ " <td>-14.0</td>\n",
1037
+ " <td>0.0464</td>\n",
1038
+ " <td>-4.7</td>\n",
1039
+ " <td>...</td>\n",
1040
+ " <td>38.0</td>\n",
1041
+ " <td>495.67</td>\n",
1042
+ " <td>11.79</td>\n",
1043
+ " <td>0.0</td>\n",
1044
+ " <td>1429.669818</td>\n",
1045
+ " <td>1.0</td>\n",
1046
+ " <td>148.93</td>\n",
1047
+ " <td>18.0</td>\n",
1048
+ " <td>1.0</td>\n",
1049
+ " <td>0.0</td>\n",
1050
+ " </tr>\n",
1051
+ " <tr>\n",
1052
+ " <th>6</th>\n",
1053
+ " <td>Bivalirudin</td>\n",
1054
+ " <td>solid</td>\n",
1055
+ " <td>B01AE</td>\n",
1056
+ " <td>B01A</td>\n",
1057
+ " <td>B01</td>\n",
1058
+ " <td>B</td>\n",
1059
+ " <td>2180.2853</td>\n",
1060
+ " <td>-14.0</td>\n",
1061
+ " <td>0.0464</td>\n",
1062
+ " <td>-4.7</td>\n",
1063
+ " <td>...</td>\n",
1064
+ " <td>35.0</td>\n",
1065
+ " <td>702.02</td>\n",
1066
+ " <td>9.59</td>\n",
1067
+ " <td>0.0</td>\n",
1068
+ " <td>1619.710366</td>\n",
1069
+ " <td>1.0</td>\n",
1070
+ " <td>158.96</td>\n",
1071
+ " <td>27.0</td>\n",
1072
+ " <td>-3.0</td>\n",
1073
+ " <td>0.0</td>\n",
1074
+ " </tr>\n",
1075
+ " <tr>\n",
1076
+ " <th>7</th>\n",
1077
+ " <td>Bivalirudin</td>\n",
1078
+ " <td>solid</td>\n",
1079
+ " <td>B01AE</td>\n",
1080
+ " <td>B01A</td>\n",
1081
+ " <td>B01</td>\n",
1082
+ " <td>B</td>\n",
1083
+ " <td>2180.2853</td>\n",
1084
+ " <td>-14.0</td>\n",
1085
+ " <td>0.0464</td>\n",
1086
+ " <td>-4.7</td>\n",
1087
+ " <td>...</td>\n",
1088
+ " <td>38.0</td>\n",
1089
+ " <td>424.98</td>\n",
1090
+ " <td>10.66</td>\n",
1091
+ " <td>0.0</td>\n",
1092
+ " <td>1414.684072</td>\n",
1093
+ " <td>1.0</td>\n",
1094
+ " <td>149.31</td>\n",
1095
+ " <td>16.0</td>\n",
1096
+ " <td>1.0</td>\n",
1097
+ " <td>0.0</td>\n",
1098
+ " </tr>\n",
1099
+ " <tr>\n",
1100
+ " <th>8</th>\n",
1101
+ " <td>Bivalirudin</td>\n",
1102
+ " <td>solid</td>\n",
1103
+ " <td>B01AE</td>\n",
1104
+ " <td>B01A</td>\n",
1105
+ " <td>B01</td>\n",
1106
+ " <td>B</td>\n",
1107
+ " <td>2180.2853</td>\n",
1108
+ " <td>-14.0</td>\n",
1109
+ " <td>0.0464</td>\n",
1110
+ " <td>-4.7</td>\n",
1111
+ " <td>...</td>\n",
1112
+ " <td>4.0</td>\n",
1113
+ " <td>116.95</td>\n",
1114
+ " <td>4.11</td>\n",
1115
+ " <td>0.0</td>\n",
1116
+ " <td>247.024574</td>\n",
1117
+ " <td>0.0</td>\n",
1118
+ " <td>20.90</td>\n",
1119
+ " <td>6.0</td>\n",
1120
+ " <td>-2.0</td>\n",
1121
+ " <td>1.0</td>\n",
1122
+ " </tr>\n",
1123
+ " <tr>\n",
1124
+ " <th>9</th>\n",
1125
+ " <td>Bivalirudin</td>\n",
1126
+ " <td>solid</td>\n",
1127
+ " <td>B01AE</td>\n",
1128
+ " <td>B01A</td>\n",
1129
+ " <td>B01</td>\n",
1130
+ " <td>B</td>\n",
1131
+ " <td>2180.2853</td>\n",
1132
+ " <td>-14.0</td>\n",
1133
+ " <td>0.0464</td>\n",
1134
+ " <td>-4.7</td>\n",
1135
+ " <td>...</td>\n",
1136
+ " <td>27.0</td>\n",
1137
+ " <td>477.85</td>\n",
1138
+ " <td>8.68</td>\n",
1139
+ " <td>0.0</td>\n",
1140
+ " <td>1354.567405</td>\n",
1141
+ " <td>1.0</td>\n",
1142
+ " <td>138.79</td>\n",
1143
+ " <td>18.0</td>\n",
1144
+ " <td>3.0</td>\n",
1145
+ " <td>0.0</td>\n",
1146
+ " </tr>\n",
1147
+ " </tbody>\n",
1148
+ "</table>\n",
1149
+ "<p>10 rows × 50 columns</p>\n",
1150
+ "</div>"
1151
+ ],
1152
+ "text/plain": [
1153
+ " name_x state_x level4_x level3_x level2_x level1_x \\\n",
1154
+ "0 Bivalirudin solid B01AE B01A B01 B \n",
1155
+ "1 Bivalirudin solid B01AE B01A B01 B \n",
1156
+ "2 Bivalirudin solid B01AE B01A B01 B \n",
1157
+ "3 Bivalirudin solid B01AE B01A B01 B \n",
1158
+ "4 Bivalirudin solid B01AE B01A B01 B \n",
1159
+ "5 Bivalirudin solid B01AE B01A B01 B \n",
1160
+ "6 Bivalirudin solid B01AE B01A B01 B \n",
1161
+ "7 Bivalirudin solid B01AE B01A B01 B \n",
1162
+ "8 Bivalirudin solid B01AE B01A B01 B \n",
1163
+ "9 Bivalirudin solid B01AE B01A B01 B \n",
1164
+ "\n",
1165
+ " Molecular Weight_x logP_x Water Solubility_x logS_x ... \\\n",
1166
+ "0 2180.2853 -14.0 0.0464 -4.7 ... \n",
1167
+ "1 2180.2853 -14.0 0.0464 -4.7 ... \n",
1168
+ "2 2180.2853 -14.0 0.0464 -4.7 ... \n",
1169
+ "3 2180.2853 -14.0 0.0464 -4.7 ... \n",
1170
+ "4 2180.2853 -14.0 0.0464 -4.7 ... \n",
1171
+ "5 2180.2853 -14.0 0.0464 -4.7 ... \n",
1172
+ "6 2180.2853 -14.0 0.0464 -4.7 ... \n",
1173
+ "7 2180.2853 -14.0 0.0464 -4.7 ... \n",
1174
+ "8 2180.2853 -14.0 0.0464 -4.7 ... \n",
1175
+ "9 2180.2853 -14.0 0.0464 -4.7 ... \n",
1176
+ "\n",
1177
+ " Rotatable Bond Count_y Polar Surface Area (PSA)_y \\\n",
1178
+ "0 66.0 901.57 \n",
1179
+ "1 32.0 429.04 \n",
1180
+ "2 33.0 495.89 \n",
1181
+ "3 50.0 519.89 \n",
1182
+ "4 19.0 435.41 \n",
1183
+ "5 38.0 495.67 \n",
1184
+ "6 35.0 702.02 \n",
1185
+ "7 38.0 424.98 \n",
1186
+ "8 4.0 116.95 \n",
1187
+ "9 27.0 477.85 \n",
1188
+ "\n",
1189
+ " pKa (strongest basic)_y Ghose Filter_y Monoisotopic Weight_y \\\n",
1190
+ "0 11.88 0.0 2178.985813 \n",
1191
+ "1 11.92 0.0 1208.645462 \n",
1192
+ "2 10.91 0.0 1268.641439 \n",
1193
+ "3 NaN 0.0 1810.033419 \n",
1194
+ "4 11.77 0.0 1068.426956 \n",
1195
+ "5 11.79 0.0 1429.669818 \n",
1196
+ "6 9.59 0.0 1619.710366 \n",
1197
+ "7 10.66 0.0 1414.684072 \n",
1198
+ "8 4.11 0.0 247.024574 \n",
1199
+ "9 8.68 0.0 1354.567405 \n",
1200
+ "\n",
1201
+ " MDDR-Like Rule_y Polarizability_y H Bond Acceptor Count_y \\\n",
1202
+ "0 1.0 218.54 37.0 \n",
1203
+ "1 1.0 125.24 16.0 \n",
1204
+ "2 1.0 130.74 18.0 \n",
1205
+ "3 1.0 194.73 16.0 \n",
1206
+ "4 1.0 104.78 15.0 \n",
1207
+ "5 1.0 148.93 18.0 \n",
1208
+ "6 1.0 158.96 27.0 \n",
1209
+ "7 1.0 149.31 16.0 \n",
1210
+ "8 0.0 20.90 6.0 \n",
1211
+ "9 1.0 138.79 18.0 \n",
1212
+ "\n",
1213
+ " Physiological Charge_y Rule of Five_y \n",
1214
+ "0 -4.0 0.0 \n",
1215
+ "1 1.0 0.0 \n",
1216
+ "2 1.0 0.0 \n",
1217
+ "3 0.0 0.0 \n",
1218
+ "4 1.0 0.0 \n",
1219
+ "5 1.0 0.0 \n",
1220
+ "6 -3.0 0.0 \n",
1221
+ "7 1.0 0.0 \n",
1222
+ "8 -2.0 1.0 \n",
1223
+ "9 3.0 0.0 \n",
1224
+ "\n",
1225
+ "[10 rows x 50 columns]"
1226
+ ]
1227
+ },
1228
+ "execution_count": 1,
1229
+ "metadata": {},
1230
+ "output_type": "execute_result"
1231
+ }
1232
+ ],
1233
+ "source": [
1234
+ "import pandas as pd\n",
1235
+ "\n",
1236
+ "catboost_df = pd.read_csv('datasets/drug_pairs.csv', index_col=0)\n",
1237
+ "catboost_df.head(10)"
1238
+ ]
1239
+ },
1240
+ {
1241
+ "cell_type": "code",
1242
+ "execution_count": 3,
1243
+ "metadata": {},
1244
+ "outputs": [],
1245
+ "source": [
1246
+ "import json\n",
1247
+ "\n",
1248
+ "with open('interactions.json', 'r') as f:\n",
1249
+ " interactions = json.load(f)\n"
1250
+ ]
1251
+ },
1252
+ {
1253
+ "cell_type": "code",
1254
+ "execution_count": 37,
1255
+ "metadata": {},
1256
+ "outputs": [],
1257
+ "source": [
1258
+ "# Create a new column in the dataframe to store the interaction label\n",
1259
+ "# For each drug pair, check if the interaction is present in the interactions dictionary\n",
1260
+ "# If yes, assign 1, else 0\n",
1261
+ "catboost_df['interaction'] = catboost_df.apply(lambda x: 1 if x['name_y'] in interactions.get(x['name_x'], list()) else 0, axis=1)"
1262
+ ]
1263
+ },
1264
+ {
1265
+ "cell_type": "code",
1266
+ "execution_count": 27,
1267
+ "metadata": {},
1268
+ "outputs": [
1269
+ {
1270
+ "data": {
1271
+ "text/html": [
1272
+ "<div>\n",
1273
+ "<style scoped>\n",
1274
+ " .dataframe tbody tr th:only-of-type {\n",
1275
+ " vertical-align: middle;\n",
1276
+ " }\n",
1277
+ "\n",
1278
+ " .dataframe tbody tr th {\n",
1279
+ " vertical-align: top;\n",
1280
+ " }\n",
1281
+ "\n",
1282
+ " .dataframe thead th {\n",
1283
+ " text-align: right;\n",
1284
+ " }\n",
1285
+ "</style>\n",
1286
+ "<table border=\"1\" class=\"dataframe\">\n",
1287
+ " <thead>\n",
1288
+ " <tr style=\"text-align: right;\">\n",
1289
+ " <th></th>\n",
1290
+ " <th>drug1</th>\n",
1291
+ " <th>drug2</th>\n",
1292
+ " <th>interaction</th>\n",
1293
+ " </tr>\n",
1294
+ " </thead>\n",
1295
+ " <tbody>\n",
1296
+ " <tr>\n",
1297
+ " <th>0</th>\n",
1298
+ " <td>hey1</td>\n",
1299
+ " <td>hello1</td>\n",
1300
+ " <td>1</td>\n",
1301
+ " </tr>\n",
1302
+ " <tr>\n",
1303
+ " <th>1</th>\n",
1304
+ " <td>hey2</td>\n",
1305
+ " <td>hello2</td>\n",
1306
+ " <td>1</td>\n",
1307
+ " </tr>\n",
1308
+ " <tr>\n",
1309
+ " <th>2</th>\n",
1310
+ " <td>hey3</td>\n",
1311
+ " <td>hello3</td>\n",
1312
+ " <td>0</td>\n",
1313
+ " </tr>\n",
1314
+ " <tr>\n",
1315
+ " <th>3</th>\n",
1316
+ " <td>hey4</td>\n",
1317
+ " <td>hello4</td>\n",
1318
+ " <td>0</td>\n",
1319
+ " </tr>\n",
1320
+ " </tbody>\n",
1321
+ "</table>\n",
1322
+ "</div>"
1323
+ ],
1324
+ "text/plain": [
1325
+ " drug1 drug2 interaction\n",
1326
+ "0 hey1 hello1 1\n",
1327
+ "1 hey2 hello2 1\n",
1328
+ "2 hey3 hello3 0\n",
1329
+ "3 hey4 hello4 0"
1330
+ ]
1331
+ },
1332
+ "execution_count": 27,
1333
+ "metadata": {},
1334
+ "output_type": "execute_result"
1335
+ }
1336
+ ],
1337
+ "source": [
1338
+ "'''\n",
1339
+ "Dummy example to show how to use the interactions dictionary\n",
1340
+ "'''\n",
1341
+ "\n",
1342
+ "dummy = pd.DataFrame({'drug1': ['hey1', 'hey2', 'hey3', 'hey4'], 'drug2': ['hello1', 'hello2', 'hello3', 'hello4']}, columns=['drug1', 'drug2'])\n",
1343
+ "i = {\n",
1344
+ " 'hey1': ['hello1'],\n",
1345
+ " 'hey2': ['hello1', 'hello2'],\n",
1346
+ " 'hey3': ['hello4'],\n",
1347
+ " 'hey4': [],\n",
1348
+ " 'hey5': ['hello1', 'hello2', 'hello3', 'hello4']\n",
1349
+ "}\n",
1350
+ "dummy['interaction'] = dummy.apply(lambda x: 1 if x['drug2'] in i.get(x['drug1'], list()) else 0, axis=1)\n",
1351
+ "dummy.head()"
1352
+ ]
1353
+ },
1354
+ {
1355
+ "cell_type": "code",
1356
+ "execution_count": 40,
1357
+ "metadata": {},
1358
+ "outputs": [
1359
+ {
1360
+ "data": {
1361
+ "text/plain": [
1362
+ "<AxesSubplot:>"
1363
+ ]
1364
+ },
1365
+ "execution_count": 40,
1366
+ "metadata": {},
1367
+ "output_type": "execute_result"
1368
+ },
1369
+ {
1370
+ "data": {
1371
+ "image/png": "iVBORw0KGgoAAAANSUhEUgAAAhYAAAGsCAYAAACB/u5dAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjQuMywgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/MnkTPAAAACXBIWXMAAA9hAAAPYQGoP6dpAAAcw0lEQVR4nO3de5CVdf3A8c+yLAfRXRU3FHLxQoMWgjfS8PITTSJ1SGsqJ4rIvFSiUzJdNDOWvJGjjo2ZmZrkTEjZhJUiQhYymk5KMOElFfGaoqK5C2wdDrvP749mN3EBOev3POtZX68Z/tiHZ5/z4cPieXvO2T01WZZlAQCQQL/eHgAA6DuEBQCQjLAAAJIRFgBAMsICAEhGWAAAyQgLACAZYQEAJCMsAIBkhAUAkEyvhcWSJUti0qRJMWzYsKipqYnbbrut7GtkWRaXX355jBw5MgqFQrz//e+Piy++OP2wAMA26d9bN7x+/frYf//948tf/nJ86lOf6tE1vv71r8fChQvj8ssvj9GjR8frr78er7/+euJJAYBtVfNueBOympqamDdvXpx00kldx4rFYpx//vlxyy23xBtvvBH77bdf/PCHP4zx48dHRMRjjz0WY8aMiYcffjj22Wef3hkcANjEu/Y1FmeddVbcf//9MXfu3Pj73/8en/nMZ+LjH/94PPnkkxER8Yc//CH23nvvuP3222OvvfaKPffcM0477TSPWABAL3pXhsVzzz0XN910U9x6661x5JFHxogRI+Kb3/xmHHHEEXHTTTdFRMSqVavi2WefjVtvvTVuvvnmmD17dixdujQ+/elP9/L0APDe1WuvsdiaFStWRHt7e4wcOXKT48ViMXbZZZeIiOjo6IhisRg333xz13k33nhjHHzwwfH44497egQAesG7MizWrVsXtbW1sXTp0qitrd3k93bYYYeIiBg6dGj0799/k/j44Ac/GBH/fcRDWABA/t6VYXHggQdGe3t7vPLKK3HkkUdu9pzDDz88Nm7cGE899VSMGDEiIiKeeOKJiIjYY489cpsVAPifXvuukHXr1sXKlSsj4r8hceWVV8bRRx8dgwcPjuHDh8cXvvCFuO++++KKK66IAw88MF599dW4++67Y8yYMXHCCSdER0dHfPjDH44ddtghrrrqqujo6Ihp06ZFQ0NDLFy4sDf+SADwntdrYbF48eI4+uijux2fOnVqzJ49O0qlUlx00UVx8803xz//+c9obGyMj3zkIzFz5swYPXp0RES8+OKLcfbZZ8fChQtj++23j+OOOy6uuOKKGDx4cN5/HAAg3iU/xwIA6Bveld9uCgBUJ2EBACST+3eFdHR0xIsvvhj19fVRU1OT980DAD2QZVmsXbs2hg0bFv36bflxidzD4sUXX4ympqa8bxYASOD555+P3XfffYu/n3tY1NfXR8R/B2toaEh23VKpFAsXLoyPfexjUVdXl+y6bMqe82PX+bDnfNhzPiq559bW1mhqauq6H9+S3MOi8+mPhoaG5GExaNCgaGho8EVbQfacH7vOhz3nw57zkcee3+5lDF68CQAkIywAgGSEBQCQjLAAAJIRFgBAMsICAEhGWAAAyQgLACAZYQEAJCMsAIBkhAUAkIywAACSERYAQDLCAgBIJve3Ta+0/ZrvimL71t/S9d3kmVkn9PYIAJCMRywAgGSEBQCQjLAAAJIRFgBAMsICAEhGWAAAyQgLACAZYQEAJCMsAIBkhAUAkIywAACSERYAQDLCAgBIRlgAAMkICwAgGWEBACQjLACAZIQFAJCMsAAAkhEWAEAywgIASEZYAADJCAsAIBlhAQAkIywAgGSEBQCQjLAAAJIRFgBAMsICAEhGWAAAyQgLACCZssKiubk5ampqNvm17777Vmo2AKDK9C/3E0aNGhV//OMf/3eB/mVfAgDoo8qugv79+8duu+1WiVkAgCpXdlg8+eSTMWzYsBg4cGCMGzcuLr300hg+fPgW
zy8Wi1EsFrs+bm1tjYiIUqkUpVKpByNvXue1Cv2yZNfMQ8od5KFz3mqbuxrZdT7sOR/2nI9K7nlbr1mTZdk23xPfeeedsW7duthnn33ipZdeipkzZ8Y///nPePjhh6O+vn6zn9Pc3BwzZ87sdnzOnDkxaNCgbb1pAKAXtbW1xeTJk6OlpSUaGhq2eF5ZYfFWb7zxRuyxxx5x5ZVXxqmnnrrZczb3iEVTU1OsWbNmq4OVq1QqxaJFi+KCh/pFsaMm2XUr7eHmib09Qlk69zxhwoSoq6vr7XH6NLvOhz3nw57zUck9t7a2RmNj49uGxTt65eVOO+0UI0eOjJUrV27xnEKhEIVCodvxurq6inxxFTtqothePWFRrf/AKvX3R3d2nQ97zoc956MSe97W672jn2Oxbt26eOqpp2Lo0KHv5DIAQB9RVlh885vfjHvuuSeeeeaZ+Mtf/hKf/OQno7a2Nj73uc9Vaj4AoIqU9VTICy+8EJ/73Ofitddei/e9731xxBFHxAMPPBDve9/7KjUfAFBFygqLuXPnVmoOAKAP8F4hAEAywgIASEZYAADJCAsAIBlhAQAkIywAgGSEBQCQjLAAAJIRFgBAMsICAEhGWAAAyQgLACAZYQEAJCMsAIBkhAUAkIywAACSERYAQDLCAgBIRlgAAMkICwAgGWEBACQjLACAZIQFAJCMsAAAkhEWAEAywgIASEZYAADJCAsAIBlhAQAkIywAgGSEBQCQjLAAAJIRFgBAMsICAEhGWAAAyQgLACAZYQEAJCMsAIBkhAUAkIywAACSERYAQDLCAgBIRlgAAMkICwAgGWEBACQjLACAZIQFAJCMsAAAkhEWAEAywgIASEZYAADJCAsAIJl3FBazZs2Kmpqa+MY3vpFoHACgmvU4LB588MG47rrrYsyYMSnnAQCqWI/CYt26dfH5z38+rr/++th5551TzwQAVKn+PfmkadOmxQknnBDHHntsXHTRRVs9t1gsRrFY7Pq4tbU1IiJKpVKUSqWe3PxmdV6r0C9Lds08pNxBHjrnrba5q5Fd58Oe82HP+ajknrf1mjVZlpV1Tzx37ty4+OKL48EHH4yBAwfG+PHj44ADDoirrrpqs+c3NzfHzJkzux2fM2dODBo0qJybBgB6SVtbW0yePDlaWlqioaFhi+eVFRbPP/98jB07NhYtWtT12oq3C4vNPWLR1NQUa9as2epg5SqVSrFo0aK44KF+UeyoSXbdSnu4eWJvj1CWzj1PmDAh6urqenucPs2u82HP+bDnfFRyz62trdHY2Pi2YVHWUyFLly6NV155JQ466KCuY+3t7bFkyZL48Y9/HMViMWprazf5nEKhEIVCodu16urqKvLFVeyoiWJ79YRFtf4Dq9TfH93ZdT7sOR/2nI9K7Hlbr1dWWHz0ox+NFStWbHLslFNOiX333Te+853vdIsKAOC9paywqK+vj/3222+TY9tvv33ssssu3Y4DAO89fvImAJBMj77d9M0WL16cYAwAoC/wiAUAkIywAACSERYAQDLCAgBIRlgAAMkICwAgGWEBACQjLACAZIQFAJCMsAAAkhEWAEAywgIASEZYAADJCAsAIBlhAQAkIywAgGSEBQCQjLAAAJIRFgBAMsICAEhGWAAAyQgLACAZYQEAJCMsAIBkhAUAkIywAACSERYAQDLCAgBIRlgAAMkICwAgGWEBACQjLACAZIQFAJCMsAAAkhEWAEAywgIASEZYAADJCAsAIBlhAQAkIywAgGSEBQCQjLAAAJIRFgBAMsICAEhGWAAAyQgLACAZYQEAJCMsAIBkhAUAkIywAACSERYAQDLCAgBIRlgAAMmUFRbXXnttjBkzJhoaGqKhoSHGjRsXd955Z6VmAwCqTFlhsfvuu8esWbNi6dKl8dBDD8UxxxwTJ554YjzyyCOVmg8AqCL9yzl50qRJm3x88cUXx7XXXhsPPPBAjBo1KulgAED1KSss3qy9vT1uvfXWWL9+fYwb
N26L5xWLxSgWi10ft7a2RkREqVSKUqnU05vvpvNahX5ZsmvmIeUO8tA5b7XNXY3sOh/2nA97zkcl97yt16zJsqyse+IVK1bEuHHj4j//+U/ssMMOMWfOnDj++OO3eH5zc3PMnDmz2/E5c+bEoEGDyrlpAKCXtLW1xeTJk6OlpSUaGhq2eF7ZYbFhw4Z47rnnoqWlJX7zm9/EDTfcEPfcc0986EMf2uz5m3vEoqmpKdasWbPVwcpVKpVi0aJFccFD/aLYUZPsupX2cPPE3h6hLJ17njBhQtTV1fX2OH2aXefDnvNhz/mo5J5bW1ujsbHxbcOi7KdCBgwYEB/4wAciIuLggw+OBx98MH70ox/Fddddt9nzC4VCFAqFbsfr6uoq8sVV7KiJYnv1hEW1/gOr1N8f3dl1Puw5H/acj0rseVuv945/jkVHR8cmj0gAAO9dZT1icd5558Vxxx0Xw4cPj7Vr18acOXNi8eLFcdddd1VqPgCgipQVFq+88kp88YtfjJdeeil23HHHGDNmTNx1110xYcKESs0HAFSRssLixhtvrNQcAEAf4L1CAIBkhAUAkIywAACSERYAQDLCAgBIRlgAAMkICwAgGWEBACQjLACAZIQFAJCMsAAAkhEWAEAywgIASEZYAADJCAsAIBlhAQAkIywAgGSEBQCQjLAAAJIRFgBAMsICAEhGWAAAyQgLACAZYQEAJCMsAIBkhAUAkIywAACSERYAQDLCAgBIRlgAAMkICwAgGWEBACQjLACAZIQFAJCMsAAAkhEWAEAywgIASEZYAADJCAsAIBlhAQAkIywAgGSEBQCQjLAAAJIRFgBAMsICAEhGWAAAyQgLACAZYQEAJCMsAIBkhAUAkIywAACSERYAQDJlhcWll14aH/7wh6O+vj6GDBkSJ510Ujz++OOVmg0AqDJlhcU999wT06ZNiwceeCAWLVoUpVIpPvaxj8X69esrNR8AUEX6l3PyggULNvl49uzZMWTIkFi6dGn83//9X9LBAIDqU1ZYvFVLS0tERAwePHiL5xSLxSgWi10ft7a2RkREqVSKUqn0Tm5+E53XKvTLkl0zDyl3kIfOeatt7mpk1/mw53zYcz4quedtvWZNlmU9uifu6OiIT3ziE/HGG2/Evffeu8XzmpubY+bMmd2Oz5kzJwYNGtSTmwYActbW1haTJ0+OlpaWaGho2OJ5PQ6Lr33ta3HnnXfGvffeG7vvvvsWz9vcIxZNTU2xZs2arQ5WrlKpFIsWLYoLHuoXxY6aZNettIebJ/b2CGXp3POECROirq6ut8fp0+w6H/acD3vORyX33NraGo2NjW8bFj16KuSss86K22+/PZYsWbLVqIiIKBQKUSgUuh2vq6uryBdXsaMmiu3VExbV+g+sUn9/dGfX+bDnfNhzPiqx5229XllhkWVZnH322TFv3rxYvHhx7LXXXj0aDgDom8oKi2nTpsWcOXPid7/7XdTX18fq1asjImLHHXeM7bbbriIDAgDVo6yfY3HttddGS0tLjB8/PoYOHdr161e/+lWl5gMAqkjZT4UAAGyJ9woBAJIRFgBAMsICAEhGWAAAyQgLACAZYQEAJCMsAIBkhAUAkIywAACSERYAQDLCAgBIRlgAAMkICwAgGWEBACQjLACAZIQFAJCMsAAAkhEWAEAywgIASEZYAADJCAsAIBlhAQAkIywAgGSEBQCQjLAAAJIRFgBAMsICAEhGWAAAyQgLACAZYQEAJCMsAIBkhAUAkIywAACSERYAQDL9e3sAAHi32vPcO3p7hLIUarO47JDencEjFgBAMsICAEhGWAAAyQgLACAZYQEAJCMsAIBkhAUAkIywAACSERYAQDLCAgBIRlgAAMkICwAgGWEBACQjLACAZIQFAJCMsAAAkhEWAEAyZYfFkiVLYtKkSTFs2LCoqamJ2267rQJjAQDVqOywWL9+fey///5xzTXXVGIeAKCK9S/3E4477rg47rjjKjELAFDlyg6L
chWLxSgWi10ft7a2RkREqVSKUqmU7HY6r1XolyW7Zh5S7iAPnfNW29zVyK7zYc/5qNY9F2qr6z6l8z6wEnve1mvWZFnW463V1NTEvHnz4qSTTtriOc3NzTFz5sxux+fMmRODBg3q6U0DADlqa2uLyZMnR0tLSzQ0NGzxvIqHxeYesWhqaoo1a9ZsdbBylUqlWLRoUVzwUL8odtQku26lPdw8sbdHKEvnnidMmBB1dXW9PU6fZtf5sOd8VOue92u+q7dHKEuhXxYXju2oyJ5bW1ujsbHxbcOi4k+FFAqFKBQK3Y7X1dVV5Iur2FETxfbqCYtq+gf2ZpX6+6M7u86HPeej2vZcTfcnb1aJPW/r9fwcCwAgmbIfsVi3bl2sXLmy6+Onn346li9fHoMHD47hw4cnHQ4AqC5lh8VDDz0URx99dNfH06dPj4iIqVOnxuzZs5MNBgBUn7LDYvz48fEOXu8JAPRhXmMBACQjLACAZIQFAJCMsAAAkhEWAEAywgIASEZYAADJCAsAIBlhAQAkIywAgGSEBQCQjLAAAJIRFgBAMsICAEhGWAAAyQgLACAZYQEAJCMsAIBkhAUAkIywAACSERYAQDLCAgBIRlgAAMkICwAgGWEBACQjLACAZIQFAJCMsAAAkhEWAEAywgIASEZYAADJCAsAIBlhAQAkIywAgGSEBQCQjLAAAJIRFgBAMsICAEhGWAAAyQgLACAZYQEAJCMsAIBkhAUAkIywAACSERYAQDLCAgBIRlgAAMkICwAgGWEBACQjLACAZIQFAJCMsAAAkulRWFxzzTWx5557xsCBA+PQQw+Nv/71r6nnAgCqUNlh8atf/SqmT58eM2bMiL/97W+x//77x8SJE+OVV16pxHwAQBUpOyyuvPLKOP300+OUU06JD33oQ/HTn/40Bg0aFD//+c8rMR8AUEX6l3Pyhg0bYunSpXHeeed1HevXr18ce+yxcf/992/2c4rFYhSLxa6PW1paIiLi9ddfj1Kp1JOZN6tUKkVbW1v0L/WL9o6aZNettNdee623RyhL555fe+21qKur6+1x+jS7zoc956Na99x/4/reHqEs/TuyaGvrqMie165dGxERWZZtfYZyLrpmzZpob2+PXXfddZPju+66a/zjH//Y7OdceumlMXPmzG7H99prr3Juus9qvKK3JwCgL5lc4euvXbs2dtxxxy3+fllh0RPnnXdeTJ8+vevjjo6OeP3112OXXXaJmpp0jyy0trZGU1NTPP/889HQ0JDsumzKnvNj1/mw53zYcz4quecsy2Lt2rUxbNiwrZ5XVlg0NjZGbW1tvPzyy5scf/nll2O33Xbb7OcUCoUoFAqbHNtpp53KudmyNDQ0+KLNgT3nx67zYc/5sOd8VGrPW3ukolNZL94cMGBAHHzwwXH33Xd3Hevo6Ii77747xo0bV/6EAECfUvZTIdOnT4+pU6fG2LFj45BDDomrrroq1q9fH6ecckol5gMAqkjZYXHyySfHq6++Gt///vdj9erVccABB8SCBQu6vaAzb4VCIWbMmNHtaRfSsuf82HU+7Dkf9pyPd8Oea7K3+74RAIBt5L1CAIBkhAUAkIywAACSERYAQDJVFRblvl37rbfeGvvuu28MHDgwRo8eHfPnz89p0upWzp6vv/76OPLII2PnnXeOnXfeOY499ti3/Xvhv8r9eu40d+7cqKmpiZNOOqmyA/Yh5e76jTfeiGnTpsXQoUOjUCjEyJEj/fdjG5S756uuuir22Wef2G677aKpqSnOOeec+M9//pPTtNVpyZIlMWnSpBg2bFjU1NTEbbfd9rafs3jx4jjooIOiUCjEBz7wgZg9e3Zlh8yqxNy5c7MBAwZkP//5z7NHHnkkO/3007Oddtope/nllzd7/n333ZfV1tZml112Wfboo49m3/ve97K6urpsxYoVOU9eXcrd8+TJk7NrrrkmW7ZsWfbYY49lX/rSl7Idd9wxe+GFF3KevLqUu+dOTz/9
dPb+978/O/LII7MTTzwxn2GrXLm7LhaL2dixY7Pjjz8+u/fee7Onn346W7x4cbZ8+fKcJ68u5e75l7/8ZVYoFLJf/vKX2dNPP53ddddd2dChQ7Nzzjkn58mry/z587Pzzz8/++1vf5tFRDZv3rytnr9q1aps0KBB2fTp07NHH300u/rqq7Pa2tpswYIFFZuxasLikEMOyaZNm9b1cXt7ezZs2LDs0ksv3ez5n/3sZ7MTTjhhk2OHHnpo9pWvfKWic1a7cvf8Vhs3bszq6+uzX/ziF5UasU/oyZ43btyYHXbYYdkNN9yQTZ06VVhso3J3fe2112Z77713tmHDhrxG7BPK3fO0adOyY445ZpNj06dPzw4//PCKztmXbEtYfPvb385GjRq1ybGTTz45mzhxYsXmqoqnQjrfrv3YY4/tOvZ2b9d+//33b3J+RMTEiRO3eD492/NbtbW1RalUisGDB1dqzKrX0z3/4Ac/iCFDhsSpp56ax5h9Qk92/fvf/z7GjRsX06ZNi1133TX222+/uOSSS6K9vT2vsatOT/Z82GGHxdKlS7ueLlm1alXMnz8/jj/++Fxmfq/ojfvCir+7aQo9ebv21atXb/b81atXV2zOateTPb/Vd77znRg2bFi3L2T+pyd7vvfee+PGG2+M5cuX5zBh39GTXa9atSr+9Kc/xec///mYP39+rFy5Ms4888wolUoxY8aMPMauOj3Z8+TJk2PNmjVxxBFHRJZlsXHjxvjqV78a3/3ud/MY+T1jS/eFra2t8e9//zu222675LdZFY9YUB1mzZoVc+fOjXnz5sXAgQN7e5w+Y+3atTFlypS4/vrro7GxsbfH6fM6OjpiyJAh8bOf/SwOPvjgOPnkk+P888+Pn/70p709Wp+yePHiuOSSS+InP/lJ/O1vf4vf/va3cccdd8SFF17Y26PxDlXFIxY9ebv23Xbbrazz6dmeO11++eUxa9as+OMf/xhjxoyp5JhVr9w9P/XUU/HMM8/EpEmTuo51dHRERET//v3j8ccfjxEjRlR26CrVk6/poUOHRl1dXdTW1nYd++AHPxirV6+ODRs2xIABAyo6czXqyZ4vuOCCmDJlSpx22mkRETF69OhYv359nHHGGXH++edHv37+vzeFLd0XNjQ0VOTRiogqecSiJ2/XPm7cuE3Oj4hYtGiRt3ffip7sOSLisssuiwsvvDAWLFgQY8eOzWPUqlbunvfdd99YsWJFLF++vOvXJz7xiTj66KNj+fLl0dTUlOf4VaUnX9OHH354rFy5siveIiKeeOKJGDp0qKjYgp7sua2trVs8dMZc5i2skumV+8KKvSw0sblz52aFQiGbPXt29uijj2ZnnHFGttNOO2WrV6/OsizLpkyZkp177rld5993331Z//79s8svvzx77LHHshkzZvh2021Q7p5nzZqVDRgwIPvNb36TvfTSS12/1q5d21t/hKpQ7p7fyneFbLtyd/3cc89l9fX12VlnnZU9/vjj2e23354NGTIku+iii3rrj1AVyt3zjBkzsvr6+uyWW27JVq1alS1cuDAbMWJE9tnPfra3/ghVYe3atdmyZcuyZcuWZRGRXXnlldmyZcuyZ599NsuyLDv33HOzKVOmdJ3f+e2m3/rWt7LHHnssu+aaa3y76ZtdffXV2fDhw7MBAwZkhxxySPbAAw90/d5RRx2VTZ06dZPzf/3rX2cjR47MBgwYkI0aNSq74447cp64OpWz5z322COLiG6/ZsyYkf/gVabcr+c3ExblKXfXf/nLX7JDDz00KxQK2d57751dfPHF2caNG3OeuvqUs+dSqZQ1NzdnI0aMyAYOHJg1NTVlZ555Zvavf/0r/8GryJ///OfN/je3c7dTp07NjjrqqG6fc8ABB2QDBgzI9t577+ymm26q6IzeNh0ASKYqXmMBAFQHYQEAJCMsAIBkhAUAkIywAACSERYAQDLCAgBIRlgAAMkICwAgGWEBACQjLACAZIQFAJDM
/wPtyEe9ddUmcAAAAABJRU5ErkJggg==",
1372
+ "text/plain": [
1373
+ "<Figure size 640x480 with 1 Axes>"
1374
+ ]
1375
+ },
1376
+ "metadata": {},
1377
+ "output_type": "display_data"
1378
+ }
1379
+ ],
1380
+ "source": [
1381
+ "catboost_df[\"interaction\"].hist()\n"
1382
+ ]
1383
+ },
1384
+ {
1385
+ "cell_type": "code",
1386
+ "execution_count": 45,
1387
+ "metadata": {},
1388
+ "outputs": [
1389
+ {
1390
+ "data": {
1391
+ "text/plain": [
1392
+ "0 5565768\n",
1393
+ "1 1351132\n",
1394
+ "Name: interaction, dtype: int64"
1395
+ ]
1396
+ },
1397
+ "execution_count": 45,
1398
+ "metadata": {},
1399
+ "output_type": "execute_result"
1400
+ }
1401
+ ],
1402
+ "source": [
1403
+ "catboost_df['interaction'].value_counts()"
1404
+ ]
1405
+ },
1406
+ {
1407
+ "cell_type": "code",
1408
+ "execution_count": 46,
1409
+ "metadata": {},
1410
+ "outputs": [],
1411
+ "source": [
1412
+ "catboost_df.to_csv('datasets/catboost_df.csv')"
1413
+ ]
1414
+ },
1415
+ {
1416
+ "cell_type": "code",
1417
+ "execution_count": 49,
1418
+ "metadata": {},
1419
+ "outputs": [
1420
+ {
1421
+ "data": {
1422
+ "text/html": [
1423
+ "<div>\n",
1424
+ "<style scoped>\n",
1425
+ " .dataframe tbody tr th:only-of-type {\n",
1426
+ " vertical-align: middle;\n",
1427
+ " }\n",
1428
+ "\n",
1429
+ " .dataframe tbody tr th {\n",
1430
+ " vertical-align: top;\n",
1431
+ " }\n",
1432
+ "\n",
1433
+ " .dataframe thead th {\n",
1434
+ " text-align: right;\n",
1435
+ " }\n",
1436
+ "</style>\n",
1437
+ "<table border=\"1\" class=\"dataframe\">\n",
1438
+ " <thead>\n",
1439
+ " <tr style=\"text-align: right;\">\n",
1440
+ " <th></th>\n",
1441
+ " <th>state_x</th>\n",
1442
+ " <th>level4_x</th>\n",
1443
+ " <th>level3_x</th>\n",
1444
+ " <th>level2_x</th>\n",
1445
+ " <th>level1_x</th>\n",
1446
+ " <th>Molecular Weight_x</th>\n",
1447
+ " <th>logP_x</th>\n",
1448
+ " <th>Water Solubility_x</th>\n",
1449
+ " <th>logS_x</th>\n",
1450
+ " <th>Bioavailability_x</th>\n",
1451
+ " <th>...</th>\n",
1452
+ " <th>Polar Surface Area (PSA)_y</th>\n",
1453
+ " <th>pKa (strongest basic)_y</th>\n",
1454
+ " <th>Ghose Filter_y</th>\n",
1455
+ " <th>Monoisotopic Weight_y</th>\n",
1456
+ " <th>MDDR-Like Rule_y</th>\n",
1457
+ " <th>Polarizability_y</th>\n",
1458
+ " <th>H Bond Acceptor Count_y</th>\n",
1459
+ " <th>Physiological Charge_y</th>\n",
1460
+ " <th>Rule of Five_y</th>\n",
1461
+ " <th>interaction</th>\n",
1462
+ " </tr>\n",
1463
+ " </thead>\n",
1464
+ " <tbody>\n",
1465
+ " <tr>\n",
1466
+ " <th>0</th>\n",
1467
+ " <td>solid</td>\n",
1468
+ " <td>B01AE</td>\n",
1469
+ " <td>B01A</td>\n",
1470
+ " <td>B01</td>\n",
1471
+ " <td>B</td>\n",
1472
+ " <td>2180.2853</td>\n",
1473
+ " <td>-14.0</td>\n",
1474
+ " <td>0.0464</td>\n",
1475
+ " <td>-4.7</td>\n",
1476
+ " <td>0.0</td>\n",
1477
+ " <td>...</td>\n",
1478
+ " <td>901.57</td>\n",
1479
+ " <td>11.88</td>\n",
1480
+ " <td>0.0</td>\n",
1481
+ " <td>2178.985813</td>\n",
1482
+ " <td>1.0</td>\n",
1483
+ " <td>218.54</td>\n",
1484
+ " <td>37.0</td>\n",
1485
+ " <td>-4.0</td>\n",
1486
+ " <td>0.0</td>\n",
1487
+ " <td>0</td>\n",
1488
+ " </tr>\n",
1489
+ " <tr>\n",
1490
+ " <th>1</th>\n",
1491
+ " <td>solid</td>\n",
1492
+ " <td>B01AE</td>\n",
1493
+ " <td>B01A</td>\n",
1494
+ " <td>B01</td>\n",
1495
+ " <td>B</td>\n",
1496
+ " <td>2180.2853</td>\n",
1497
+ " <td>-14.0</td>\n",
1498
+ " <td>0.0464</td>\n",
1499
+ " <td>-4.7</td>\n",
1500
+ " <td>0.0</td>\n",
1501
+ " <td>...</td>\n",
1502
+ " <td>429.04</td>\n",
1503
+ " <td>11.92</td>\n",
1504
+ " <td>0.0</td>\n",
1505
+ " <td>1208.645462</td>\n",
1506
+ " <td>1.0</td>\n",
1507
+ " <td>125.24</td>\n",
1508
+ " <td>16.0</td>\n",
1509
+ " <td>1.0</td>\n",
1510
+ " <td>0.0</td>\n",
1511
+ " <td>0</td>\n",
1512
+ " </tr>\n",
1513
+ " <tr>\n",
1514
+ " <th>2</th>\n",
1515
+ " <td>solid</td>\n",
1516
+ " <td>B01AE</td>\n",
1517
+ " <td>B01A</td>\n",
1518
+ " <td>B01</td>\n",
1519
+ " <td>B</td>\n",
1520
+ " <td>2180.2853</td>\n",
1521
+ " <td>-14.0</td>\n",
1522
+ " <td>0.0464</td>\n",
1523
+ " <td>-4.7</td>\n",
1524
+ " <td>0.0</td>\n",
1525
+ " <td>...</td>\n",
1526
+ " <td>495.89</td>\n",
1527
+ " <td>10.91</td>\n",
1528
+ " <td>0.0</td>\n",
1529
+ " <td>1268.641439</td>\n",
1530
+ " <td>1.0</td>\n",
1531
+ " <td>130.74</td>\n",
1532
+ " <td>18.0</td>\n",
1533
+ " <td>1.0</td>\n",
1534
+ " <td>0.0</td>\n",
1535
+ " <td>0</td>\n",
1536
+ " </tr>\n",
1537
+ " <tr>\n",
1538
+ " <th>3</th>\n",
1539
+ " <td>solid</td>\n",
1540
+ " <td>B01AE</td>\n",
1541
+ " <td>B01A</td>\n",
1542
+ " <td>B01</td>\n",
1543
+ " <td>B</td>\n",
1544
+ " <td>2180.2853</td>\n",
1545
+ " <td>-14.0</td>\n",
1546
+ " <td>0.0464</td>\n",
1547
+ " <td>-4.7</td>\n",
1548
+ " <td>0.0</td>\n",
1549
+ " <td>...</td>\n",
1550
+ " <td>519.89</td>\n",
1551
+ " <td>NaN</td>\n",
1552
+ " <td>0.0</td>\n",
1553
+ " <td>1810.033419</td>\n",
1554
+ " <td>1.0</td>\n",
1555
+ " <td>194.73</td>\n",
1556
+ " <td>16.0</td>\n",
1557
+ " <td>0.0</td>\n",
1558
+ " <td>0.0</td>\n",
1559
+ " <td>0</td>\n",
1560
+ " </tr>\n",
1561
+ " <tr>\n",
1562
+ " <th>4</th>\n",
1563
+ " <td>solid</td>\n",
1564
+ " <td>B01AE</td>\n",
1565
+ " <td>B01A</td>\n",
1566
+ " <td>B01</td>\n",
1567
+ " <td>B</td>\n",
1568
+ " <td>2180.2853</td>\n",
1569
+ " <td>-14.0</td>\n",
1570
+ " <td>0.0464</td>\n",
1571
+ " <td>-4.7</td>\n",
1572
+ " <td>0.0</td>\n",
1573
+ " <td>...</td>\n",
1574
+ " <td>435.41</td>\n",
1575
+ " <td>11.77</td>\n",
1576
+ " <td>0.0</td>\n",
1577
+ " <td>1068.426956</td>\n",
1578
+ " <td>1.0</td>\n",
1579
+ " <td>104.78</td>\n",
1580
+ " <td>15.0</td>\n",
1581
+ " <td>1.0</td>\n",
1582
+ " <td>0.0</td>\n",
1583
+ " <td>0</td>\n",
1584
+ " </tr>\n",
1585
+ " <tr>\n",
1586
+ " <th>5</th>\n",
1587
+ " <td>solid</td>\n",
1588
+ " <td>B01AE</td>\n",
1589
+ " <td>B01A</td>\n",
1590
+ " <td>B01</td>\n",
1591
+ " <td>B</td>\n",
1592
+ " <td>2180.2853</td>\n",
1593
+ " <td>-14.0</td>\n",
1594
+ " <td>0.0464</td>\n",
1595
+ " <td>-4.7</td>\n",
1596
+ " <td>0.0</td>\n",
1597
+ " <td>...</td>\n",
1598
+ " <td>495.67</td>\n",
1599
+ " <td>11.79</td>\n",
1600
+ " <td>0.0</td>\n",
1601
+ " <td>1429.669818</td>\n",
1602
+ " <td>1.0</td>\n",
1603
+ " <td>148.93</td>\n",
1604
+ " <td>18.0</td>\n",
1605
+ " <td>1.0</td>\n",
1606
+ " <td>0.0</td>\n",
1607
+ " <td>0</td>\n",
1608
+ " </tr>\n",
1609
+ " <tr>\n",
1610
+ " <th>6</th>\n",
1611
+ " <td>solid</td>\n",
1612
+ " <td>B01AE</td>\n",
1613
+ " <td>B01A</td>\n",
1614
+ " <td>B01</td>\n",
1615
+ " <td>B</td>\n",
1616
+ " <td>2180.2853</td>\n",
1617
+ " <td>-14.0</td>\n",
1618
+ " <td>0.0464</td>\n",
1619
+ " <td>-4.7</td>\n",
1620
+ " <td>0.0</td>\n",
1621
+ " <td>...</td>\n",
1622
+ " <td>702.02</td>\n",
1623
+ " <td>9.59</td>\n",
1624
+ " <td>0.0</td>\n",
1625
+ " <td>1619.710366</td>\n",
1626
+ " <td>1.0</td>\n",
1627
+ " <td>158.96</td>\n",
1628
+ " <td>27.0</td>\n",
1629
+ " <td>-3.0</td>\n",
1630
+ " <td>0.0</td>\n",
1631
+ " <td>0</td>\n",
1632
+ " </tr>\n",
1633
+ " <tr>\n",
1634
+ " <th>7</th>\n",
1635
+ " <td>solid</td>\n",
1636
+ " <td>B01AE</td>\n",
1637
+ " <td>B01A</td>\n",
1638
+ " <td>B01</td>\n",
1639
+ " <td>B</td>\n",
1640
+ " <td>2180.2853</td>\n",
1641
+ " <td>-14.0</td>\n",
1642
+ " <td>0.0464</td>\n",
1643
+ " <td>-4.7</td>\n",
1644
+ " <td>0.0</td>\n",
1645
+ " <td>...</td>\n",
1646
+ " <td>424.98</td>\n",
1647
+ " <td>10.66</td>\n",
1648
+ " <td>0.0</td>\n",
1649
+ " <td>1414.684072</td>\n",
1650
+ " <td>1.0</td>\n",
1651
+ " <td>149.31</td>\n",
1652
+ " <td>16.0</td>\n",
1653
+ " <td>1.0</td>\n",
1654
+ " <td>0.0</td>\n",
1655
+ " <td>0</td>\n",
1656
+ " </tr>\n",
1657
+ " <tr>\n",
1658
+ " <th>8</th>\n",
1659
+ " <td>solid</td>\n",
1660
+ " <td>B01AE</td>\n",
1661
+ " <td>B01A</td>\n",
1662
+ " <td>B01</td>\n",
1663
+ " <td>B</td>\n",
1664
+ " <td>2180.2853</td>\n",
1665
+ " <td>-14.0</td>\n",
1666
+ " <td>0.0464</td>\n",
1667
+ " <td>-4.7</td>\n",
1668
+ " <td>0.0</td>\n",
1669
+ " <td>...</td>\n",
1670
+ " <td>116.95</td>\n",
1671
+ " <td>4.11</td>\n",
1672
+ " <td>0.0</td>\n",
1673
+ " <td>247.024574</td>\n",
1674
+ " <td>0.0</td>\n",
1675
+ " <td>20.90</td>\n",
1676
+ " <td>6.0</td>\n",
1677
+ " <td>-2.0</td>\n",
1678
+ " <td>1.0</td>\n",
1679
+ " <td>0</td>\n",
1680
+ " </tr>\n",
1681
+ " <tr>\n",
1682
+ " <th>9</th>\n",
1683
+ " <td>solid</td>\n",
1684
+ " <td>B01AE</td>\n",
1685
+ " <td>B01A</td>\n",
1686
+ " <td>B01</td>\n",
1687
+ " <td>B</td>\n",
1688
+ " <td>2180.2853</td>\n",
1689
+ " <td>-14.0</td>\n",
1690
+ " <td>0.0464</td>\n",
1691
+ " <td>-4.7</td>\n",
1692
+ " <td>0.0</td>\n",
1693
+ " <td>...</td>\n",
1694
+ " <td>477.85</td>\n",
1695
+ " <td>8.68</td>\n",
1696
+ " <td>0.0</td>\n",
1697
+ " <td>1354.567405</td>\n",
1698
+ " <td>1.0</td>\n",
1699
+ " <td>138.79</td>\n",
1700
+ " <td>18.0</td>\n",
1701
+ " <td>3.0</td>\n",
1702
+ " <td>0.0</td>\n",
1703
+ " <td>0</td>\n",
1704
+ " </tr>\n",
1705
+ " </tbody>\n",
1706
+ "</table>\n",
1707
+ "<p>10 rows × 49 columns</p>\n",
1708
+ "</div>"
1709
+ ],
1710
+ "text/plain": [
1711
+ " state_x level4_x level3_x level2_x level1_x Molecular Weight_x logP_x \\\n",
1712
+ "0 solid B01AE B01A B01 B 2180.2853 -14.0 \n",
1713
+ "1 solid B01AE B01A B01 B 2180.2853 -14.0 \n",
1714
+ "2 solid B01AE B01A B01 B 2180.2853 -14.0 \n",
1715
+ "3 solid B01AE B01A B01 B 2180.2853 -14.0 \n",
1716
+ "4 solid B01AE B01A B01 B 2180.2853 -14.0 \n",
1717
+ "5 solid B01AE B01A B01 B 2180.2853 -14.0 \n",
1718
+ "6 solid B01AE B01A B01 B 2180.2853 -14.0 \n",
1719
+ "7 solid B01AE B01A B01 B 2180.2853 -14.0 \n",
1720
+ "8 solid B01AE B01A B01 B 2180.2853 -14.0 \n",
1721
+ "9 solid B01AE B01A B01 B 2180.2853 -14.0 \n",
1722
+ "\n",
1723
+ " Water Solubility_x logS_x Bioavailability_x ... \\\n",
1724
+ "0 0.0464 -4.7 0.0 ... \n",
1725
+ "1 0.0464 -4.7 0.0 ... \n",
1726
+ "2 0.0464 -4.7 0.0 ... \n",
1727
+ "3 0.0464 -4.7 0.0 ... \n",
1728
+ "4 0.0464 -4.7 0.0 ... \n",
1729
+ "5 0.0464 -4.7 0.0 ... \n",
1730
+ "6 0.0464 -4.7 0.0 ... \n",
1731
+ "7 0.0464 -4.7 0.0 ... \n",
1732
+ "8 0.0464 -4.7 0.0 ... \n",
1733
+ "9 0.0464 -4.7 0.0 ... \n",
1734
+ "\n",
1735
+ " Polar Surface Area (PSA)_y pKa (strongest basic)_y Ghose Filter_y \\\n",
1736
+ "0 901.57 11.88 0.0 \n",
1737
+ "1 429.04 11.92 0.0 \n",
1738
+ "2 495.89 10.91 0.0 \n",
1739
+ "3 519.89 NaN 0.0 \n",
1740
+ "4 435.41 11.77 0.0 \n",
1741
+ "5 495.67 11.79 0.0 \n",
1742
+ "6 702.02 9.59 0.0 \n",
1743
+ "7 424.98 10.66 0.0 \n",
1744
+ "8 116.95 4.11 0.0 \n",
1745
+ "9 477.85 8.68 0.0 \n",
1746
+ "\n",
1747
+ " Monoisotopic Weight_y MDDR-Like Rule_y Polarizability_y \\\n",
1748
+ "0 2178.985813 1.0 218.54 \n",
1749
+ "1 1208.645462 1.0 125.24 \n",
1750
+ "2 1268.641439 1.0 130.74 \n",
1751
+ "3 1810.033419 1.0 194.73 \n",
1752
+ "4 1068.426956 1.0 104.78 \n",
1753
+ "5 1429.669818 1.0 148.93 \n",
1754
+ "6 1619.710366 1.0 158.96 \n",
1755
+ "7 1414.684072 1.0 149.31 \n",
1756
+ "8 247.024574 0.0 20.90 \n",
1757
+ "9 1354.567405 1.0 138.79 \n",
1758
+ "\n",
1759
+ " H Bond Acceptor Count_y Physiological Charge_y Rule of Five_y \\\n",
1760
+ "0 37.0 -4.0 0.0 \n",
1761
+ "1 16.0 1.0 0.0 \n",
1762
+ "2 18.0 1.0 0.0 \n",
1763
+ "3 16.0 0.0 0.0 \n",
1764
+ "4 15.0 1.0 0.0 \n",
1765
+ "5 18.0 1.0 0.0 \n",
1766
+ "6 27.0 -3.0 0.0 \n",
1767
+ "7 16.0 1.0 0.0 \n",
1768
+ "8 6.0 -2.0 1.0 \n",
1769
+ "9 18.0 3.0 0.0 \n",
1770
+ "\n",
1771
+ " interaction \n",
1772
+ "0 0 \n",
1773
+ "1 0 \n",
1774
+ "2 0 \n",
1775
+ "3 0 \n",
1776
+ "4 0 \n",
1777
+ "5 0 \n",
1778
+ "6 0 \n",
1779
+ "7 0 \n",
1780
+ "8 0 \n",
1781
+ "9 0 \n",
1782
+ "\n",
1783
+ "[10 rows x 49 columns]"
1784
+ ]
1785
+ },
1786
+ "execution_count": 49,
1787
+ "metadata": {},
1788
+ "output_type": "execute_result"
1789
+ }
1790
+ ],
1791
+ "source": [
1792
+ "# drop label name_x and name_y\n",
1793
+ "catboost_df = catboost_df.drop(['name_x', 'name_y'], axis=1)\n",
1794
+ "catboost_df.head(10)"
1795
+ ]
1796
+ },
1797
+ {
1798
+ "cell_type": "code",
1799
+ "execution_count": 68,
1800
+ "metadata": {},
1801
+ "outputs": [],
1802
+ "source": [
1803
+ "# create test and train set\n",
1804
+ "from sklearn.model_selection import train_test_split\n",
1805
+ "X, y = catboost_df.drop('interaction', axis=1), catboost_df['interaction']\n",
1806
+ "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)"
1807
+ ]
1808
+ },
1809
+ {
1810
+ "cell_type": "code",
1811
+ "execution_count": 66,
1812
+ "metadata": {},
1813
+ "outputs": [
1814
+ {
1815
+ "name": "stdout",
1816
+ "output_type": "stream",
1817
+ "text": [
1818
+ "['state_x', 'level4_x', 'level3_x', 'level2_x', 'level1_x', 'state_y', 'level4_y', 'level3_y', 'level2_y', 'level1_y']\n",
1819
+ "['state_x', 'level4_x', 'level3_x', 'level2_x', 'level1_x', 'Molecular Weight_x', 'logP_x', 'Water Solubility_x', 'logS_x', 'Bioavailability_x', 'pKa (strongest acidic)_x', 'Refractivity_x', 'Number of Rings_x', 'H Bond Donor Count_x', 'Rotatable Bond Count_x', 'Polar Surface Area (PSA)_x', 'pKa (strongest basic)_x', 'Ghose Filter_x', 'Monoisotopic Weight_x', 'MDDR-Like Rule_x', 'Polarizability_x', 'H Bond Acceptor Count_x', 'Physiological Charge_x', 'Rule of Five_x', 'state_y', 'level4_y', 'level3_y', 'level2_y', 'level1_y', 'Molecular Weight_y', 'logP_y', 'Water Solubility_y', 'logS_y', 'Bioavailability_y', 'pKa (strongest acidic)_y', 'Refractivity_y', 'Number of Rings_y', 'H Bond Donor Count_y', 'Rotatable Bond Count_y', 'Polar Surface Area (PSA)_y', 'pKa (strongest basic)_y', 'Ghose Filter_y', 'Monoisotopic Weight_y', 'MDDR-Like Rule_y', 'Polarizability_y', 'H Bond Acceptor Count_y', 'Physiological Charge_y', 'Rule of Five_y', 'interaction']\n",
1820
+ "['Molecular Weight_x', 'logP_x', 'Water Solubility_x', 'logS_x', 'Bioavailability_x', 'pKa (strongest acidic)_x', 'Refractivity_x', 'Number of Rings_x', 'H Bond Donor Count_x', 'Rotatable Bond Count_x', 'Polar Surface Area (PSA)_x', 'pKa (strongest basic)_x', 'Ghose Filter_x', 'Monoisotopic Weight_x', 'MDDR-Like Rule_x', 'Polarizability_x', 'H Bond Acceptor Count_x', 'Physiological Charge_x', 'Rule of Five_x', 'Molecular Weight_y', 'logP_y', 'Water Solubility_y', 'logS_y', 'Bioavailability_y', 'pKa (strongest acidic)_y', 'Refractivity_y', 'Number of Rings_y', 'H Bond Donor Count_y', 'Rotatable Bond Count_y', 'Polar Surface Area (PSA)_y', 'pKa (strongest basic)_y', 'Ghose Filter_y', 'Monoisotopic Weight_y', 'MDDR-Like Rule_y', 'Polarizability_y', 'H Bond Acceptor Count_y', 'Physiological Charge_y', 'Rule of Five_y']\n",
1821
+ "10 + 38 = 49\n"
1822
+ ]
1823
+ }
1824
+ ],
1825
+ "source": [
1826
+ "# get all the columns whose dtype is object\n",
1827
+ "cat_features = list(catboost_df.select_dtypes(include=['object']).columns)\n",
1828
+ "print(cat_features)\n",
1829
+ "print(list(catboost_df.columns))\n",
1830
+ "float_features = list(catboost_df.select_dtypes(include=['float64']).columns)\n",
1831
+ "print(float_features)\n",
1832
+ "print(f\"{len(cat_features)} + {len(float_features)} = {len(catboost_df.columns)}\")"
1833
+ ]
1834
+ }
1835
+ ],
1836
+ "metadata": {
1837
+ "kernelspec": {
1838
+ "display_name": "Python 3",
1839
+ "language": "python",
1840
+ "name": "python3"
1841
+ },
1842
+ "language_info": {
1843
+ "codemirror_mode": {
1844
+ "name": "ipython",
1845
+ "version": 3
1846
+ },
1847
+ "file_extension": ".py",
1848
+ "mimetype": "text/x-python",
1849
+ "name": "python",
1850
+ "nbconvert_exporter": "python",
1851
+ "pygments_lexer": "ipython3",
1852
+ "version": "3.8.10"
1853
+ },
1854
+ "orig_nbformat": 4
1855
+ },
1856
+ "nbformat": 4,
1857
+ "nbformat_minor": 2
1858
+ }
catboost/train.py ADDED
@@ -0,0 +1,66 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ import pandas as pd
3
+ from sklearn.model_selection import train_test_split
4
+ from sklearn.experimental import enable_halving_search_cv
5
+ from sklearn.model_selection import HalvingGridSearchCV, RandomizedSearchCV
6
+ from catboost import CatBoostClassifier, Pool
7
+ from sklearn.metrics import roc_auc_score
8
+ from sklearn.metrics import accuracy_score
9
+ from pandas.core.common import random_state
10
+
11
import os  # needed only to make sure the model output directory exists

# Train a CatBoost classifier that predicts whether a pair of drugs interacts.
#
# Pipeline: load the pairwise feature table -> split into train/test ->
# impute missing numeric values from the training split -> (halving) grid
# search on the GPU -> save the best model -> report the held-out ROC AUC.

MODEL_PATH = 'models/catboost_model2.cbm'

# Load the pre-built feature table; the first CSV column holds the row index.
catboost_df = pd.read_csv('datasets/catboost_df.csv', index_col=0)
# The drug names only identify the pair, so they are dropped rather than
# being fed to the model as features.
catboost_df = catboost_df.drop(['name_x', 'name_y'], axis=1)

# Separate the features from the `interaction` target first, so the feature
# lists below can never contain the target column.
X, y = catboost_df.drop('interaction', axis=1), catboost_df['interaction']

# Categorical features are the object-typed columns; float features are the
# float64 columns (the only ones that get mean-imputed).
cat_features = list(X.select_dtypes(include=['object']).columns)
float_features = list(X.select_dtypes(include=['float64']).columns)

# Cast categorical columns to str so missing values become the literal
# string "nan" instead of a float NaN inside a categorical column.
for feature in cat_features:
    X[feature] = X[feature].astype(str)

# Create the train and test sets.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)

# Impute missing numeric values with the column means of the TRAINING split
# only. Computing the means on the full table would leak test-set statistics
# into training and into the reported test score. `fillna` with a Series
# fills each column with the value whose label matches the column name and
# returns a new frame, which avoids chained in-place assignment.
train_means = X_train[float_features].mean()
X_train = X_train.fillna(train_means)
X_test = X_test.fillna(train_means)

catb_model = CatBoostClassifier(
    random_state=42,
    task_type="GPU",
    max_ctr_complexity=1,
    boosting_type="Plain",
    cat_features=cat_features,
    gpu_ram_part=0.4,
)
# Each hyper-parameter currently has a single candidate value, so the search
# evaluates exactly one configuration with 3-fold cross-validation.
catb_param = {
    'max_depth': [6],
    'learning_rate': [0.01],
    'reg_lambda': [2.5],
    'n_estimators': [1000],
}

# Create the output directory up front so a missing folder cannot make the
# script fail only after the (long) training run has finished.
os.makedirs(os.path.dirname(MODEL_PATH), exist_ok=True)

# Grid search.
# NOTE(review): n_jobs=-1 lets the CV fits run in parallel while every fit
# targets the GPU with gpu_ram_part=0.4 -- confirm the device has enough
# memory for the concurrent fits.
grid_search = HalvingGridSearchCV(
    catb_model, catb_param, cv=3, n_jobs=-1, verbose=2)
grid_search.fit(X_train, y_train)

print("Done")

# Persist the estimator refitted with the best parameters.
best_model = grid_search.best_estimator_
best_model.save_model(MODEL_PATH)

# Print the best parameters and the best cross-validation score.
print(grid_search.best_params_)
print(grid_search.best_score_)

# Evaluate on the held-out test split using the predicted probability of the
# second class (column 1 of predict_proba).
y_p = grid_search.predict_proba(X_test)
print(f"Test AUC_ROC score = {roc_auc_score(y_test, y_p[:, 1])}")

print("---------------------Done--------------------------------")
dimensionality_reduction.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
link_prediction.ipynb ADDED
@@ -0,0 +1,1482 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 1,
6
+ "metadata": {},
7
+ "outputs": [],
8
+ "source": [
9
+ "import pandas as pd\n",
10
+ "import numpy as np\n",
11
+ "import json"
12
+ ]
13
+ },
14
+ {
15
+ "cell_type": "code",
16
+ "execution_count": 2,
17
+ "metadata": {},
18
+ "outputs": [],
19
+ "source": [
20
+ "with open('data/interactions.json') as f:\n",
21
+ " ddi_json = json.load(f)"
22
+ ]
23
+ },
24
+ {
25
+ "cell_type": "code",
26
+ "execution_count": 3,
27
+ "metadata": {},
28
+ "outputs": [
29
+ {
30
+ "data": {
31
+ "text/html": [
32
+ "<div>\n",
33
+ "<style scoped>\n",
34
+ " .dataframe tbody tr th:only-of-type {\n",
35
+ " vertical-align: middle;\n",
36
+ " }\n",
37
+ "\n",
38
+ " .dataframe tbody tr th {\n",
39
+ " vertical-align: top;\n",
40
+ " }\n",
41
+ "\n",
42
+ " .dataframe thead th {\n",
43
+ " text-align: right;\n",
44
+ " }\n",
45
+ "</style>\n",
46
+ "<table border=\"1\" class=\"dataframe\">\n",
47
+ " <thead>\n",
48
+ " <tr style=\"text-align: right;\">\n",
49
+ " <th></th>\n",
50
+ " <th>Unnamed: 0</th>\n",
51
+ " <th>name</th>\n",
52
+ " <th>state</th>\n",
53
+ " <th>level4</th>\n",
54
+ " <th>level3</th>\n",
55
+ " <th>level2</th>\n",
56
+ " <th>level1</th>\n",
57
+ " <th>Molecular Weight</th>\n",
58
+ " <th>logP</th>\n",
59
+ " <th>Water Solubility</th>\n",
60
+ " <th>...</th>\n",
61
+ " <th>Rotatable Bond Count</th>\n",
62
+ " <th>Polar Surface Area (PSA)</th>\n",
63
+ " <th>pKa (strongest basic)</th>\n",
64
+ " <th>Ghose Filter</th>\n",
65
+ " <th>Monoisotopic Weight</th>\n",
66
+ " <th>MDDR-Like Rule</th>\n",
67
+ " <th>Polarizability</th>\n",
68
+ " <th>H Bond Acceptor Count</th>\n",
69
+ " <th>Physiological Charge</th>\n",
70
+ " <th>Rule of Five</th>\n",
71
+ " </tr>\n",
72
+ " </thead>\n",
73
+ " <tbody>\n",
74
+ " <tr>\n",
75
+ " <th>0</th>\n",
76
+ " <td>5</td>\n",
77
+ " <td>Bivalirudin</td>\n",
78
+ " <td>solid</td>\n",
79
+ " <td>B01AE</td>\n",
80
+ " <td>B01A</td>\n",
81
+ " <td>B01</td>\n",
82
+ " <td>B</td>\n",
83
+ " <td>2180.2853</td>\n",
84
+ " <td>-14.00</td>\n",
85
+ " <td>0.04640</td>\n",
86
+ " <td>...</td>\n",
87
+ " <td>66.0</td>\n",
88
+ " <td>901.57</td>\n",
89
+ " <td>11.88</td>\n",
90
+ " <td>0.0</td>\n",
91
+ " <td>2178.985813</td>\n",
92
+ " <td>1.0</td>\n",
93
+ " <td>218.54</td>\n",
94
+ " <td>37.0</td>\n",
95
+ " <td>-4.0</td>\n",
96
+ " <td>0.0</td>\n",
97
+ " </tr>\n",
98
+ " <tr>\n",
99
+ " <th>1</th>\n",
100
+ " <td>6</td>\n",
101
+ " <td>Leuprolide</td>\n",
102
+ " <td>solid</td>\n",
103
+ " <td>L02AE</td>\n",
104
+ " <td>L02A</td>\n",
105
+ " <td>L02</td>\n",
106
+ " <td>L</td>\n",
107
+ " <td>1209.3983</td>\n",
108
+ " <td>-2.40</td>\n",
109
+ " <td>0.03380</td>\n",
110
+ " <td>...</td>\n",
111
+ " <td>32.0</td>\n",
112
+ " <td>429.04</td>\n",
113
+ " <td>11.92</td>\n",
114
+ " <td>0.0</td>\n",
115
+ " <td>1208.645462</td>\n",
116
+ " <td>1.0</td>\n",
117
+ " <td>125.24</td>\n",
118
+ " <td>16.0</td>\n",
119
+ " <td>1.0</td>\n",
120
+ " <td>0.0</td>\n",
121
+ " </tr>\n",
122
+ " <tr>\n",
123
+ " <th>2</th>\n",
124
+ " <td>13</td>\n",
125
+ " <td>Goserelin</td>\n",
126
+ " <td>solid</td>\n",
127
+ " <td>L02AE</td>\n",
128
+ " <td>L02A</td>\n",
129
+ " <td>L02</td>\n",
130
+ " <td>L</td>\n",
131
+ " <td>1269.4105</td>\n",
132
+ " <td>-5.10</td>\n",
133
+ " <td>0.02830</td>\n",
134
+ " <td>...</td>\n",
135
+ " <td>33.0</td>\n",
136
+ " <td>495.89</td>\n",
137
+ " <td>10.91</td>\n",
138
+ " <td>0.0</td>\n",
139
+ " <td>1268.641439</td>\n",
140
+ " <td>1.0</td>\n",
141
+ " <td>130.74</td>\n",
142
+ " <td>18.0</td>\n",
143
+ " <td>1.0</td>\n",
144
+ " <td>0.0</td>\n",
145
+ " </tr>\n",
146
+ " <tr>\n",
147
+ " <th>3</th>\n",
148
+ " <td>25</td>\n",
149
+ " <td>Gramicidin D</td>\n",
150
+ " <td>liquid</td>\n",
151
+ " <td>R02AB</td>\n",
152
+ " <td>R02A</td>\n",
153
+ " <td>R02</td>\n",
154
+ " <td>R</td>\n",
155
+ " <td>1811.2530</td>\n",
156
+ " <td>5.96</td>\n",
157
+ " <td>0.00390</td>\n",
158
+ " <td>...</td>\n",
159
+ " <td>50.0</td>\n",
160
+ " <td>519.89</td>\n",
161
+ " <td>NaN</td>\n",
162
+ " <td>0.0</td>\n",
163
+ " <td>1810.033419</td>\n",
164
+ " <td>1.0</td>\n",
165
+ " <td>194.73</td>\n",
166
+ " <td>16.0</td>\n",
167
+ " <td>0.0</td>\n",
168
+ " <td>0.0</td>\n",
169
+ " </tr>\n",
170
+ " <tr>\n",
171
+ " <th>4</th>\n",
172
+ " <td>33</td>\n",
173
+ " <td>Desmopressin</td>\n",
174
+ " <td>solid</td>\n",
175
+ " <td>H01BA</td>\n",
176
+ " <td>H01B</td>\n",
177
+ " <td>H01</td>\n",
178
+ " <td>H</td>\n",
179
+ " <td>1069.2200</td>\n",
180
+ " <td>-6.10</td>\n",
181
+ " <td>0.11000</td>\n",
182
+ " <td>...</td>\n",
183
+ " <td>19.0</td>\n",
184
+ " <td>435.41</td>\n",
185
+ " <td>11.77</td>\n",
186
+ " <td>0.0</td>\n",
187
+ " <td>1068.426956</td>\n",
188
+ " <td>1.0</td>\n",
189
+ " <td>104.78</td>\n",
190
+ " <td>15.0</td>\n",
191
+ " <td>1.0</td>\n",
192
+ " <td>0.0</td>\n",
193
+ " </tr>\n",
194
+ " <tr>\n",
195
+ " <th>...</th>\n",
196
+ " <td>...</td>\n",
197
+ " <td>...</td>\n",
198
+ " <td>...</td>\n",
199
+ " <td>...</td>\n",
200
+ " <td>...</td>\n",
201
+ " <td>...</td>\n",
202
+ " <td>...</td>\n",
203
+ " <td>...</td>\n",
204
+ " <td>...</td>\n",
205
+ " <td>...</td>\n",
206
+ " <td>...</td>\n",
207
+ " <td>...</td>\n",
208
+ " <td>...</td>\n",
209
+ " <td>...</td>\n",
210
+ " <td>...</td>\n",
211
+ " <td>...</td>\n",
212
+ " <td>...</td>\n",
213
+ " <td>...</td>\n",
214
+ " <td>...</td>\n",
215
+ " <td>...</td>\n",
216
+ " <td>...</td>\n",
217
+ " </tr>\n",
218
+ " <tr>\n",
219
+ " <th>2625</th>\n",
220
+ " <td>14553</td>\n",
221
+ " <td>Belumosudil</td>\n",
222
+ " <td>solid</td>\n",
223
+ " <td>L04AA</td>\n",
224
+ " <td>L04A</td>\n",
225
+ " <td>L04</td>\n",
226
+ " <td>L</td>\n",
227
+ " <td>452.5180</td>\n",
228
+ " <td>4.65</td>\n",
229
+ " <td>0.00289</td>\n",
230
+ " <td>...</td>\n",
231
+ " <td>7.0</td>\n",
232
+ " <td>104.82</td>\n",
233
+ " <td>4.11</td>\n",
234
+ " <td>0.0</td>\n",
235
+ " <td>452.196074</td>\n",
236
+ " <td>1.0</td>\n",
237
+ " <td>49.55</td>\n",
238
+ " <td>6.0</td>\n",
239
+ " <td>0.0</td>\n",
240
+ " <td>1.0</td>\n",
241
+ " </tr>\n",
242
+ " <tr>\n",
243
+ " <th>2626</th>\n",
244
+ " <td>14688</td>\n",
245
+ " <td>Tebipenem pivoxil</td>\n",
246
+ " <td>NaN</td>\n",
247
+ " <td>J01DH</td>\n",
248
+ " <td>J01D</td>\n",
249
+ " <td>J01</td>\n",
250
+ " <td>J</td>\n",
251
+ " <td>497.6300</td>\n",
252
+ " <td>1.59</td>\n",
253
+ " <td>0.16700</td>\n",
254
+ " <td>...</td>\n",
255
+ " <td>9.0</td>\n",
256
+ " <td>108.74</td>\n",
257
+ " <td>6.27</td>\n",
258
+ " <td>0.0</td>\n",
259
+ " <td>497.165428</td>\n",
260
+ " <td>1.0</td>\n",
261
+ " <td>53.39</td>\n",
262
+ " <td>6.0</td>\n",
263
+ " <td>0.0</td>\n",
264
+ " <td>1.0</td>\n",
265
+ " </tr>\n",
266
+ " <tr>\n",
267
+ " <th>2627</th>\n",
268
+ " <td>14698</td>\n",
269
+ " <td>Tosufloxacin</td>\n",
270
+ " <td>NaN</td>\n",
271
+ " <td>J01MA</td>\n",
272
+ " <td>J01M</td>\n",
273
+ " <td>J01</td>\n",
274
+ " <td>J</td>\n",
275
+ " <td>404.3490</td>\n",
276
+ " <td>0.47</td>\n",
277
+ " <td>0.07620</td>\n",
278
+ " <td>...</td>\n",
279
+ " <td>3.0</td>\n",
280
+ " <td>99.76</td>\n",
281
+ " <td>9.80</td>\n",
282
+ " <td>1.0</td>\n",
283
+ " <td>404.109625</td>\n",
284
+ " <td>0.0</td>\n",
285
+ " <td>37.18</td>\n",
286
+ " <td>7.0</td>\n",
287
+ " <td>0.0</td>\n",
288
+ " <td>1.0</td>\n",
289
+ " </tr>\n",
290
+ " <tr>\n",
291
+ " <th>2628</th>\n",
292
+ " <td>14931</td>\n",
293
+ " <td>Linzagolix</td>\n",
294
+ " <td>solid</td>\n",
295
+ " <td>H01CC</td>\n",
296
+ " <td>H01C</td>\n",
297
+ " <td>H01</td>\n",
298
+ " <td>H</td>\n",
299
+ " <td>508.4200</td>\n",
300
+ " <td>3.88</td>\n",
301
+ " <td>0.00198</td>\n",
302
+ " <td>...</td>\n",
303
+ " <td>6.0</td>\n",
304
+ " <td>114.40</td>\n",
305
+ " <td>-3.50</td>\n",
306
+ " <td>0.0</td>\n",
307
+ " <td>508.055206</td>\n",
308
+ " <td>1.0</td>\n",
309
+ " <td>45.39</td>\n",
310
+ " <td>7.0</td>\n",
311
+ " <td>-1.0</td>\n",
312
+ " <td>0.0</td>\n",
313
+ " </tr>\n",
314
+ " <tr>\n",
315
+ " <th>2629</th>\n",
316
+ " <td>14995</td>\n",
317
+ " <td>Methionine C-11</td>\n",
318
+ " <td>NaN</td>\n",
319
+ " <td>V09IX</td>\n",
320
+ " <td>V09I</td>\n",
321
+ " <td>V09</td>\n",
322
+ " <td>V</td>\n",
323
+ " <td>148.2100</td>\n",
324
+ " <td>-2.20</td>\n",
325
+ " <td>23.90000</td>\n",
326
+ " <td>...</td>\n",
327
+ " <td>4.0</td>\n",
328
+ " <td>63.32</td>\n",
329
+ " <td>9.50</td>\n",
330
+ " <td>0.0</td>\n",
331
+ " <td>148.062484</td>\n",
332
+ " <td>0.0</td>\n",
333
+ " <td>15.54</td>\n",
334
+ " <td>3.0</td>\n",
335
+ " <td>0.0</td>\n",
336
+ " <td>1.0</td>\n",
337
+ " </tr>\n",
338
+ " </tbody>\n",
339
+ "</table>\n",
340
+ "<p>2630 rows × 26 columns</p>\n",
341
+ "</div>"
342
+ ],
343
+ "text/plain": [
344
+ " Unnamed: 0 name state level4 level3 level2 level1 \\\n",
345
+ "0 5 Bivalirudin solid B01AE B01A B01 B \n",
346
+ "1 6 Leuprolide solid L02AE L02A L02 L \n",
347
+ "2 13 Goserelin solid L02AE L02A L02 L \n",
348
+ "3 25 Gramicidin D liquid R02AB R02A R02 R \n",
349
+ "4 33 Desmopressin solid H01BA H01B H01 H \n",
350
+ "... ... ... ... ... ... ... ... \n",
351
+ "2625 14553 Belumosudil solid L04AA L04A L04 L \n",
352
+ "2626 14688 Tebipenem pivoxil NaN J01DH J01D J01 J \n",
353
+ "2627 14698 Tosufloxacin NaN J01MA J01M J01 J \n",
354
+ "2628 14931 Linzagolix solid H01CC H01C H01 H \n",
355
+ "2629 14995 Methionine C-11 NaN V09IX V09I V09 V \n",
356
+ "\n",
357
+ " Molecular Weight logP Water Solubility ... Rotatable Bond Count \\\n",
358
+ "0 2180.2853 -14.00 0.04640 ... 66.0 \n",
359
+ "1 1209.3983 -2.40 0.03380 ... 32.0 \n",
360
+ "2 1269.4105 -5.10 0.02830 ... 33.0 \n",
361
+ "3 1811.2530 5.96 0.00390 ... 50.0 \n",
362
+ "4 1069.2200 -6.10 0.11000 ... 19.0 \n",
363
+ "... ... ... ... ... ... \n",
364
+ "2625 452.5180 4.65 0.00289 ... 7.0 \n",
365
+ "2626 497.6300 1.59 0.16700 ... 9.0 \n",
366
+ "2627 404.3490 0.47 0.07620 ... 3.0 \n",
367
+ "2628 508.4200 3.88 0.00198 ... 6.0 \n",
368
+ "2629 148.2100 -2.20 23.90000 ... 4.0 \n",
369
+ "\n",
370
+ " Polar Surface Area (PSA) pKa (strongest basic) Ghose Filter \\\n",
371
+ "0 901.57 11.88 0.0 \n",
372
+ "1 429.04 11.92 0.0 \n",
373
+ "2 495.89 10.91 0.0 \n",
374
+ "3 519.89 NaN 0.0 \n",
375
+ "4 435.41 11.77 0.0 \n",
376
+ "... ... ... ... \n",
377
+ "2625 104.82 4.11 0.0 \n",
378
+ "2626 108.74 6.27 0.0 \n",
379
+ "2627 99.76 9.80 1.0 \n",
380
+ "2628 114.40 -3.50 0.0 \n",
381
+ "2629 63.32 9.50 0.0 \n",
382
+ "\n",
383
+ " Monoisotopic Weight MDDR-Like Rule Polarizability \\\n",
384
+ "0 2178.985813 1.0 218.54 \n",
385
+ "1 1208.645462 1.0 125.24 \n",
386
+ "2 1268.641439 1.0 130.74 \n",
387
+ "3 1810.033419 1.0 194.73 \n",
388
+ "4 1068.426956 1.0 104.78 \n",
389
+ "... ... ... ... \n",
390
+ "2625 452.196074 1.0 49.55 \n",
391
+ "2626 497.165428 1.0 53.39 \n",
392
+ "2627 404.109625 0.0 37.18 \n",
393
+ "2628 508.055206 1.0 45.39 \n",
394
+ "2629 148.062484 0.0 15.54 \n",
395
+ "\n",
396
+ " H Bond Acceptor Count Physiological Charge Rule of Five \n",
397
+ "0 37.0 -4.0 0.0 \n",
398
+ "1 16.0 1.0 0.0 \n",
399
+ "2 18.0 1.0 0.0 \n",
400
+ "3 16.0 0.0 0.0 \n",
401
+ "4 15.0 1.0 0.0 \n",
402
+ "... ... ... ... \n",
403
+ "2625 6.0 0.0 1.0 \n",
404
+ "2626 6.0 0.0 1.0 \n",
405
+ "2627 7.0 0.0 1.0 \n",
406
+ "2628 7.0 -1.0 0.0 \n",
407
+ "2629 3.0 0.0 1.0 \n",
408
+ "\n",
409
+ "[2630 rows x 26 columns]"
410
+ ]
411
+ },
412
+ "execution_count": 3,
413
+ "metadata": {},
414
+ "output_type": "execute_result"
415
+ }
416
+ ],
417
+ "source": [
418
+ "df_drugs = pd.read_csv('data/filtered_dataset.csv')\n",
419
+ "df_drugs"
420
+ ]
421
+ },
422
+ {
423
+ "cell_type": "code",
424
+ "execution_count": 4,
425
+ "metadata": {},
426
+ "outputs": [],
427
+ "source": [
428
+ "def adjacency_matrix(df):\n",
429
+ " # create a matrix of zeros with the same shape as the final adjacency matrix\n",
430
+ " matrix = np.zeros((len(df), len(df)), dtype=int)\n",
431
+ "\n",
432
+ " # loop through each drug and set the corresponding values in the matrix to 1\n",
433
+ " for i, drug in enumerate(df['name']):\n",
434
+ " interacting_drugs = ddi_json[drug]\n",
435
+ " indices = df.index[df['name'].isin(interacting_drugs)].tolist()\n",
436
+ " matrix[i, indices] = 1\n",
437
+ "\n",
438
+ " # convert the matrix to a dataframe and set the column names and index\n",
439
+ " df_matrix = pd.DataFrame(matrix, columns=df['name'], index=df['name'])\n",
440
+ "\n",
441
+ " return df_matrix\n",
442
+ "\n",
443
+ "df_matrix = adjacency_matrix(df_drugs)"
444
+ ]
445
+ },
446
+ {
447
+ "cell_type": "code",
448
+ "execution_count": 5,
449
+ "metadata": {},
450
+ "outputs": [
451
+ {
452
+ "data": {
453
+ "text/html": [
454
+ "<div>\n",
455
+ "<style scoped>\n",
456
+ " .dataframe tbody tr th:only-of-type {\n",
457
+ " vertical-align: middle;\n",
458
+ " }\n",
459
+ "\n",
460
+ " .dataframe tbody tr th {\n",
461
+ " vertical-align: top;\n",
462
+ " }\n",
463
+ "\n",
464
+ " .dataframe thead th {\n",
465
+ " text-align: right;\n",
466
+ " }\n",
467
+ "</style>\n",
468
+ "<table border=\"1\" class=\"dataframe\">\n",
469
+ " <thead>\n",
470
+ " <tr style=\"text-align: right;\">\n",
471
+ " <th>name</th>\n",
472
+ " <th>Bivalirudin</th>\n",
473
+ " <th>Leuprolide</th>\n",
474
+ " <th>Goserelin</th>\n",
475
+ " <th>Gramicidin D</th>\n",
476
+ " <th>Desmopressin</th>\n",
477
+ " <th>Cetrorelix</th>\n",
478
+ " <th>Daptomycin</th>\n",
479
+ " <th>Abarelix</th>\n",
480
+ " <th>Pyridoxal phosphate</th>\n",
481
+ " <th>Cyanocobalamin</th>\n",
482
+ " <th>...</th>\n",
483
+ " <th>Naphthoquine</th>\n",
484
+ " <th>Odevixibat</th>\n",
485
+ " <th>Melphalan flufenamide</th>\n",
486
+ " <th>Deucravacitinib</th>\n",
487
+ " <th>Tegoprazan</th>\n",
488
+ " <th>Belumosudil</th>\n",
489
+ " <th>Tebipenem pivoxil</th>\n",
490
+ " <th>Tosufloxacin</th>\n",
491
+ " <th>Linzagolix</th>\n",
492
+ " <th>Methionine C-11</th>\n",
493
+ " </tr>\n",
494
+ " <tr>\n",
495
+ " <th>name</th>\n",
496
+ " <th></th>\n",
497
+ " <th></th>\n",
498
+ " <th></th>\n",
499
+ " <th></th>\n",
500
+ " <th></th>\n",
501
+ " <th></th>\n",
502
+ " <th></th>\n",
503
+ " <th></th>\n",
504
+ " <th></th>\n",
505
+ " <th></th>\n",
506
+ " <th></th>\n",
507
+ " <th></th>\n",
508
+ " <th></th>\n",
509
+ " <th></th>\n",
510
+ " <th></th>\n",
511
+ " <th></th>\n",
512
+ " <th></th>\n",
513
+ " <th></th>\n",
514
+ " <th></th>\n",
515
+ " <th></th>\n",
516
+ " <th></th>\n",
517
+ " </tr>\n",
518
+ " </thead>\n",
519
+ " <tbody>\n",
520
+ " <tr>\n",
521
+ " <th>Bivalirudin</th>\n",
522
+ " <td>0</td>\n",
523
+ " <td>0</td>\n",
524
+ " <td>0</td>\n",
525
+ " <td>0</td>\n",
526
+ " <td>0</td>\n",
527
+ " <td>0</td>\n",
528
+ " <td>0</td>\n",
529
+ " <td>0</td>\n",
530
+ " <td>0</td>\n",
531
+ " <td>0</td>\n",
532
+ " <td>...</td>\n",
533
+ " <td>0</td>\n",
534
+ " <td>0</td>\n",
535
+ " <td>0</td>\n",
536
+ " <td>0</td>\n",
537
+ " <td>0</td>\n",
538
+ " <td>0</td>\n",
539
+ " <td>0</td>\n",
540
+ " <td>0</td>\n",
541
+ " <td>0</td>\n",
542
+ " <td>0</td>\n",
543
+ " </tr>\n",
544
+ " <tr>\n",
545
+ " <th>Leuprolide</th>\n",
546
+ " <td>0</td>\n",
547
+ " <td>0</td>\n",
548
+ " <td>1</td>\n",
549
+ " <td>0</td>\n",
550
+ " <td>1</td>\n",
551
+ " <td>0</td>\n",
552
+ " <td>1</td>\n",
553
+ " <td>0</td>\n",
554
+ " <td>0</td>\n",
555
+ " <td>1</td>\n",
556
+ " <td>...</td>\n",
557
+ " <td>0</td>\n",
558
+ " <td>0</td>\n",
559
+ " <td>0</td>\n",
560
+ " <td>0</td>\n",
561
+ " <td>0</td>\n",
562
+ " <td>0</td>\n",
563
+ " <td>0</td>\n",
564
+ " <td>0</td>\n",
565
+ " <td>0</td>\n",
566
+ " <td>0</td>\n",
567
+ " </tr>\n",
568
+ " <tr>\n",
569
+ " <th>Goserelin</th>\n",
570
+ " <td>0</td>\n",
571
+ " <td>1</td>\n",
572
+ " <td>0</td>\n",
573
+ " <td>0</td>\n",
574
+ " <td>1</td>\n",
575
+ " <td>0</td>\n",
576
+ " <td>1</td>\n",
577
+ " <td>0</td>\n",
578
+ " <td>0</td>\n",
579
+ " <td>1</td>\n",
580
+ " <td>...</td>\n",
581
+ " <td>0</td>\n",
582
+ " <td>0</td>\n",
583
+ " <td>0</td>\n",
584
+ " <td>0</td>\n",
585
+ " <td>0</td>\n",
586
+ " <td>0</td>\n",
587
+ " <td>0</td>\n",
588
+ " <td>0</td>\n",
589
+ " <td>0</td>\n",
590
+ " <td>0</td>\n",
591
+ " </tr>\n",
592
+ " <tr>\n",
593
+ " <th>Gramicidin D</th>\n",
594
+ " <td>0</td>\n",
595
+ " <td>0</td>\n",
596
+ " <td>0</td>\n",
597
+ " <td>0</td>\n",
598
+ " <td>0</td>\n",
599
+ " <td>0</td>\n",
600
+ " <td>0</td>\n",
601
+ " <td>0</td>\n",
602
+ " <td>0</td>\n",
603
+ " <td>0</td>\n",
604
+ " <td>...</td>\n",
605
+ " <td>0</td>\n",
606
+ " <td>0</td>\n",
607
+ " <td>0</td>\n",
608
+ " <td>0</td>\n",
609
+ " <td>0</td>\n",
610
+ " <td>0</td>\n",
611
+ " <td>0</td>\n",
612
+ " <td>0</td>\n",
613
+ " <td>0</td>\n",
614
+ " <td>0</td>\n",
615
+ " </tr>\n",
616
+ " <tr>\n",
617
+ " <th>Desmopressin</th>\n",
618
+ " <td>0</td>\n",
619
+ " <td>1</td>\n",
620
+ " <td>1</td>\n",
621
+ " <td>0</td>\n",
622
+ " <td>0</td>\n",
623
+ " <td>0</td>\n",
624
+ " <td>1</td>\n",
625
+ " <td>0</td>\n",
626
+ " <td>0</td>\n",
627
+ " <td>1</td>\n",
628
+ " <td>...</td>\n",
629
+ " <td>0</td>\n",
630
+ " <td>0</td>\n",
631
+ " <td>0</td>\n",
632
+ " <td>0</td>\n",
633
+ " <td>0</td>\n",
634
+ " <td>0</td>\n",
635
+ " <td>0</td>\n",
636
+ " <td>0</td>\n",
637
+ " <td>0</td>\n",
638
+ " <td>0</td>\n",
639
+ " </tr>\n",
640
+ " <tr>\n",
641
+ " <th>...</th>\n",
642
+ " <td>...</td>\n",
643
+ " <td>...</td>\n",
644
+ " <td>...</td>\n",
645
+ " <td>...</td>\n",
646
+ " <td>...</td>\n",
647
+ " <td>...</td>\n",
648
+ " <td>...</td>\n",
649
+ " <td>...</td>\n",
650
+ " <td>...</td>\n",
651
+ " <td>...</td>\n",
652
+ " <td>...</td>\n",
653
+ " <td>...</td>\n",
654
+ " <td>...</td>\n",
655
+ " <td>...</td>\n",
656
+ " <td>...</td>\n",
657
+ " <td>...</td>\n",
658
+ " <td>...</td>\n",
659
+ " <td>...</td>\n",
660
+ " <td>...</td>\n",
661
+ " <td>...</td>\n",
662
+ " <td>...</td>\n",
663
+ " </tr>\n",
664
+ " <tr>\n",
665
+ " <th>Belumosudil</th>\n",
666
+ " <td>0</td>\n",
667
+ " <td>0</td>\n",
668
+ " <td>0</td>\n",
669
+ " <td>0</td>\n",
670
+ " <td>0</td>\n",
671
+ " <td>0</td>\n",
672
+ " <td>1</td>\n",
673
+ " <td>0</td>\n",
674
+ " <td>0</td>\n",
675
+ " <td>0</td>\n",
676
+ " <td>...</td>\n",
677
+ " <td>0</td>\n",
678
+ " <td>0</td>\n",
679
+ " <td>0</td>\n",
680
+ " <td>1</td>\n",
681
+ " <td>0</td>\n",
682
+ " <td>0</td>\n",
683
+ " <td>0</td>\n",
684
+ " <td>0</td>\n",
685
+ " <td>0</td>\n",
686
+ " <td>0</td>\n",
687
+ " </tr>\n",
688
+ " <tr>\n",
689
+ " <th>Tebipenem pivoxil</th>\n",
690
+ " <td>0</td>\n",
691
+ " <td>0</td>\n",
692
+ " <td>0</td>\n",
693
+ " <td>0</td>\n",
694
+ " <td>0</td>\n",
695
+ " <td>0</td>\n",
696
+ " <td>0</td>\n",
697
+ " <td>0</td>\n",
698
+ " <td>0</td>\n",
699
+ " <td>0</td>\n",
700
+ " <td>...</td>\n",
701
+ " <td>0</td>\n",
702
+ " <td>0</td>\n",
703
+ " <td>0</td>\n",
704
+ " <td>0</td>\n",
705
+ " <td>0</td>\n",
706
+ " <td>0</td>\n",
707
+ " <td>0</td>\n",
708
+ " <td>0</td>\n",
709
+ " <td>0</td>\n",
710
+ " <td>0</td>\n",
711
+ " </tr>\n",
712
+ " <tr>\n",
713
+ " <th>Tosufloxacin</th>\n",
714
+ " <td>0</td>\n",
715
+ " <td>0</td>\n",
716
+ " <td>0</td>\n",
717
+ " <td>0</td>\n",
718
+ " <td>0</td>\n",
719
+ " <td>0</td>\n",
720
+ " <td>0</td>\n",
721
+ " <td>0</td>\n",
722
+ " <td>0</td>\n",
723
+ " <td>0</td>\n",
724
+ " <td>...</td>\n",
725
+ " <td>0</td>\n",
726
+ " <td>0</td>\n",
727
+ " <td>0</td>\n",
728
+ " <td>0</td>\n",
729
+ " <td>0</td>\n",
730
+ " <td>0</td>\n",
731
+ " <td>0</td>\n",
732
+ " <td>0</td>\n",
733
+ " <td>0</td>\n",
734
+ " <td>0</td>\n",
735
+ " </tr>\n",
736
+ " <tr>\n",
737
+ " <th>Linzagolix</th>\n",
738
+ " <td>0</td>\n",
739
+ " <td>0</td>\n",
740
+ " <td>0</td>\n",
741
+ " <td>0</td>\n",
742
+ " <td>0</td>\n",
743
+ " <td>0</td>\n",
744
+ " <td>0</td>\n",
745
+ " <td>0</td>\n",
746
+ " <td>0</td>\n",
747
+ " <td>0</td>\n",
748
+ " <td>...</td>\n",
749
+ " <td>0</td>\n",
750
+ " <td>0</td>\n",
751
+ " <td>0</td>\n",
752
+ " <td>0</td>\n",
753
+ " <td>0</td>\n",
754
+ " <td>0</td>\n",
755
+ " <td>0</td>\n",
756
+ " <td>0</td>\n",
757
+ " <td>0</td>\n",
758
+ " <td>0</td>\n",
759
+ " </tr>\n",
760
+ " <tr>\n",
761
+ " <th>Methionine C-11</th>\n",
762
+ " <td>0</td>\n",
763
+ " <td>0</td>\n",
764
+ " <td>0</td>\n",
765
+ " <td>0</td>\n",
766
+ " <td>0</td>\n",
767
+ " <td>0</td>\n",
768
+ " <td>0</td>\n",
769
+ " <td>0</td>\n",
770
+ " <td>0</td>\n",
771
+ " <td>0</td>\n",
772
+ " <td>...</td>\n",
773
+ " <td>0</td>\n",
774
+ " <td>0</td>\n",
775
+ " <td>0</td>\n",
776
+ " <td>0</td>\n",
777
+ " <td>0</td>\n",
778
+ " <td>0</td>\n",
779
+ " <td>0</td>\n",
780
+ " <td>0</td>\n",
781
+ " <td>0</td>\n",
782
+ " <td>0</td>\n",
783
+ " </tr>\n",
784
+ " </tbody>\n",
785
+ "</table>\n",
786
+ "<p>2630 rows × 2630 columns</p>\n",
787
+ "</div>"
788
+ ],
789
+ "text/plain": [
790
+ "name Bivalirudin Leuprolide Goserelin Gramicidin D \\\n",
791
+ "name \n",
792
+ "Bivalirudin 0 0 0 0 \n",
793
+ "Leuprolide 0 0 1 0 \n",
794
+ "Goserelin 0 1 0 0 \n",
795
+ "Gramicidin D 0 0 0 0 \n",
796
+ "Desmopressin 0 1 1 0 \n",
797
+ "... ... ... ... ... \n",
798
+ "Belumosudil 0 0 0 0 \n",
799
+ "Tebipenem pivoxil 0 0 0 0 \n",
800
+ "Tosufloxacin 0 0 0 0 \n",
801
+ "Linzagolix 0 0 0 0 \n",
802
+ "Methionine C-11 0 0 0 0 \n",
803
+ "\n",
804
+ "name Desmopressin Cetrorelix Daptomycin Abarelix \\\n",
805
+ "name \n",
806
+ "Bivalirudin 0 0 0 0 \n",
807
+ "Leuprolide 1 0 1 0 \n",
808
+ "Goserelin 1 0 1 0 \n",
809
+ "Gramicidin D 0 0 0 0 \n",
810
+ "Desmopressin 0 0 1 0 \n",
811
+ "... ... ... ... ... \n",
812
+ "Belumosudil 0 0 1 0 \n",
813
+ "Tebipenem pivoxil 0 0 0 0 \n",
814
+ "Tosufloxacin 0 0 0 0 \n",
815
+ "Linzagolix 0 0 0 0 \n",
816
+ "Methionine C-11 0 0 0 0 \n",
817
+ "\n",
818
+ "name Pyridoxal phosphate Cyanocobalamin ... Naphthoquine \\\n",
819
+ "name ... \n",
820
+ "Bivalirudin 0 0 ... 0 \n",
821
+ "Leuprolide 0 1 ... 0 \n",
822
+ "Goserelin 0 1 ... 0 \n",
823
+ "Gramicidin D 0 0 ... 0 \n",
824
+ "Desmopressin 0 1 ... 0 \n",
825
+ "... ... ... ... ... \n",
826
+ "Belumosudil 0 0 ... 0 \n",
827
+ "Tebipenem pivoxil 0 0 ... 0 \n",
828
+ "Tosufloxacin 0 0 ... 0 \n",
829
+ "Linzagolix 0 0 ... 0 \n",
830
+ "Methionine C-11 0 0 ... 0 \n",
831
+ "\n",
832
+ "name Odevixibat Melphalan flufenamide Deucravacitinib \\\n",
833
+ "name \n",
834
+ "Bivalirudin 0 0 0 \n",
835
+ "Leuprolide 0 0 0 \n",
836
+ "Goserelin 0 0 0 \n",
837
+ "Gramicidin D 0 0 0 \n",
838
+ "Desmopressin 0 0 0 \n",
839
+ "... ... ... ... \n",
840
+ "Belumosudil 0 0 1 \n",
841
+ "Tebipenem pivoxil 0 0 0 \n",
842
+ "Tosufloxacin 0 0 0 \n",
843
+ "Linzagolix 0 0 0 \n",
844
+ "Methionine C-11 0 0 0 \n",
845
+ "\n",
846
+ "name Tegoprazan Belumosudil Tebipenem pivoxil Tosufloxacin \\\n",
847
+ "name \n",
848
+ "Bivalirudin 0 0 0 0 \n",
849
+ "Leuprolide 0 0 0 0 \n",
850
+ "Goserelin 0 0 0 0 \n",
851
+ "Gramicidin D 0 0 0 0 \n",
852
+ "Desmopressin 0 0 0 0 \n",
853
+ "... ... ... ... ... \n",
854
+ "Belumosudil 0 0 0 0 \n",
855
+ "Tebipenem pivoxil 0 0 0 0 \n",
856
+ "Tosufloxacin 0 0 0 0 \n",
857
+ "Linzagolix 0 0 0 0 \n",
858
+ "Methionine C-11 0 0 0 0 \n",
859
+ "\n",
860
+ "name Linzagolix Methionine C-11 \n",
861
+ "name \n",
862
+ "Bivalirudin 0 0 \n",
863
+ "Leuprolide 0 0 \n",
864
+ "Goserelin 0 0 \n",
865
+ "Gramicidin D 0 0 \n",
866
+ "Desmopressin 0 0 \n",
867
+ "... ... ... \n",
868
+ "Belumosudil 0 0 \n",
869
+ "Tebipenem pivoxil 0 0 \n",
870
+ "Tosufloxacin 0 0 \n",
871
+ "Linzagolix 0 0 \n",
872
+ "Methionine C-11 0 0 \n",
873
+ "\n",
874
+ "[2630 rows x 2630 columns]"
875
+ ]
876
+ },
877
+ "execution_count": 5,
878
+ "metadata": {},
879
+ "output_type": "execute_result"
880
+ }
881
+ ],
882
+ "source": [
883
+ "df_matrix"
884
+ ]
885
+ },
886
+ {
887
+ "cell_type": "code",
888
+ "execution_count": 6,
889
+ "metadata": {},
890
+ "outputs": [],
891
+ "source": [
892
+ "import itertools\n",
893
+ "def random_drug_pairs(names, p, random_state=None):\n",
894
+ " \"\"\"\n",
895
+ " Selects a fraction p of all possible unordered pairs of names, sampled at random without replacement from a pandas Series of names.\n",
896
+ " \n",
897
+ " Parameters:\n",
898
+ " names (pandas.Series): A pandas series of names.\n",
899
+ " p (float): The fraction of all possible pairs to select (between 0 and 1).\n",
900
+ " random_state (int, optional): Seed for the random number generator.\n",
901
+ " \n",
902
+ " Returns:\n",
903
+ " pandas.Series: A pandas Series of the selected pairs, each a (name1, name2) tuple.\n",
904
+ " \"\"\"\n",
905
+ " # Calculate the total number of possible pairs\n",
906
+ " num_pairs = int(len(names) * (len(names) - 1) / 2)\n",
907
+ "\n",
908
+ " # Calculate the number of pairs to select\n",
909
+ " num_selected_pairs = int(p * num_pairs)\n",
910
+ "\n",
911
+ " # Generate all possible pairs of names using itertools\n",
912
+ " all_pairs = list(itertools.combinations(names, 2))\n",
913
+ "\n",
914
+ " # Select a random subset of pairs\n",
915
+ " selected_pairs = pd.Series(all_pairs).sample(n=num_selected_pairs, random_state=random_state)\n",
916
+ "\n",
917
+ " # Return the selected pairs\n",
918
+ " return selected_pairs\n",
919
+ "\n",
920
+ "def exclude_pairs_from_adjacency_matrix(df_matrix, excluded_pairs):\n",
921
+ " col_inds1 = np.array([df_matrix.columns.get_loc(drug1) for drug1, _ in excluded_pairs])\n",
922
+ " col_inds2 = np.array([df_matrix.columns.get_loc(drug2) for _, drug2 in excluded_pairs])\n",
923
+ " values = df_matrix.values[col_inds1, col_inds2]\n",
924
+ " df_matrix.values[col_inds1, col_inds2] = 0\n",
925
+ " df_matrix.values[col_inds2, col_inds1] = 0\n",
926
+ " return values\n",
927
+ "\n",
928
+ "def get_pairs_from_adjacency_matrix(df_matrix, excluded_pairs):\n",
929
+ " col_inds1 = np.array([df_matrix.columns.get_loc(drug1) for drug1, _ in excluded_pairs])\n",
930
+ " col_inds2 = np.array([df_matrix.columns.get_loc(drug2) for _, drug2 in excluded_pairs])\n",
931
+ " values = df_matrix.values[col_inds1, col_inds2]\n",
932
+ " return values"
933
+ ]
934
+ },
935
+ {
936
+ "cell_type": "code",
937
+ "execution_count": 7,
938
+ "metadata": {},
939
+ "outputs": [],
940
+ "source": [
941
+ "# train, test split \n",
942
+ "# train set: the adjacency matrix with the selected pairs zeroed out\n",
943
+ "# test set: the excluded pairs\n",
944
+ "excluded_pairs = random_drug_pairs(df_drugs['name'], 0.20, 42)\n",
945
+ "excluded_pair_values = exclude_pairs_from_adjacency_matrix(df_matrix, excluded_pairs)"
946
+ ]
947
+ },
948
+ {
949
+ "cell_type": "code",
950
+ "execution_count": 8,
951
+ "metadata": {},
952
+ "outputs": [
953
+ {
954
+ "name": "stderr",
955
+ "output_type": "stream",
956
+ "text": [
957
+ "c:\\Users\\Georg\\anaconda3\\lib\\site-packages\\sknetwork\\utils\\check.py:216: Warning: The number of neighbors must be lower than the number of nodes with known labels. Changed accordingly.\n",
958
+ " warnings.warn(Warning(\"The number of neighbors must be lower than the number of nodes with known labels. \"\n"
959
+ ]
960
+ }
961
+ ],
962
+ "source": [
963
+ "from sknetwork.linkpred import NNLinker\n",
964
+ "from sknetwork.visualization import svg_graph, svg_bigraph\n",
965
+ "linker = NNLinker(n_neighbors=2630, threshold=0)\n",
966
+ "links = linker.fit_predict(df_matrix.to_numpy())"
967
+ ]
968
+ },
969
+ {
970
+ "cell_type": "code",
971
+ "execution_count": 9,
972
+ "metadata": {},
973
+ "outputs": [
974
+ {
975
+ "data": {
976
+ "text/html": [
977
+ "<div>\n",
978
+ "<style scoped>\n",
979
+ " .dataframe tbody tr th:only-of-type {\n",
980
+ " vertical-align: middle;\n",
981
+ " }\n",
982
+ "\n",
983
+ " .dataframe tbody tr th {\n",
984
+ " vertical-align: top;\n",
985
+ " }\n",
986
+ "\n",
987
+ " .dataframe thead th {\n",
988
+ " text-align: right;\n",
989
+ " }\n",
990
+ "</style>\n",
991
+ "<table border=\"1\" class=\"dataframe\">\n",
992
+ " <thead>\n",
993
+ " <tr style=\"text-align: right;\">\n",
994
+ " <th>name</th>\n",
995
+ " <th>Bivalirudin</th>\n",
996
+ " <th>Leuprolide</th>\n",
997
+ " <th>Goserelin</th>\n",
998
+ " <th>Gramicidin D</th>\n",
999
+ " <th>Desmopressin</th>\n",
1000
+ " <th>Cetrorelix</th>\n",
1001
+ " <th>Daptomycin</th>\n",
1002
+ " <th>Abarelix</th>\n",
1003
+ " <th>Pyridoxal phosphate</th>\n",
1004
+ " <th>Cyanocobalamin</th>\n",
1005
+ " <th>...</th>\n",
1006
+ " <th>Naphthoquine</th>\n",
1007
+ " <th>Odevixibat</th>\n",
1008
+ " <th>Melphalan flufenamide</th>\n",
1009
+ " <th>Deucravacitinib</th>\n",
1010
+ " <th>Tegoprazan</th>\n",
1011
+ " <th>Belumosudil</th>\n",
1012
+ " <th>Tebipenem pivoxil</th>\n",
1013
+ " <th>Tosufloxacin</th>\n",
1014
+ " <th>Linzagolix</th>\n",
1015
+ " <th>Methionine C-11</th>\n",
1016
+ " </tr>\n",
1017
+ " </thead>\n",
1018
+ " <tbody>\n",
1019
+ " <tr>\n",
1020
+ " <th>0</th>\n",
1021
+ " <td>1.000000</td>\n",
1022
+ " <td>0.306150</td>\n",
1023
+ " <td>0.288741</td>\n",
1024
+ " <td>0.083090</td>\n",
1025
+ " <td>0.310237</td>\n",
1026
+ " <td>0.0</td>\n",
1027
+ " <td>0.338457</td>\n",
1028
+ " <td>0.0</td>\n",
1029
+ " <td>0.0</td>\n",
1030
+ " <td>0.430352</td>\n",
1031
+ " <td>...</td>\n",
1032
+ " <td>0.0</td>\n",
1033
+ " <td>0.0</td>\n",
1034
+ " <td>0.0</td>\n",
1035
+ " <td>0.305532</td>\n",
1036
+ " <td>0.0</td>\n",
1037
+ " <td>0.241866</td>\n",
1038
+ " <td>0.0</td>\n",
1039
+ " <td>0.0</td>\n",
1040
+ " <td>0.105519</td>\n",
1041
+ " <td>0.0</td>\n",
1042
+ " </tr>\n",
1043
+ " <tr>\n",
1044
+ " <th>1</th>\n",
1045
+ " <td>0.306150</td>\n",
1046
+ " <td>1.000000</td>\n",
1047
+ " <td>0.780471</td>\n",
1048
+ " <td>0.087487</td>\n",
1049
+ " <td>0.587482</td>\n",
1050
+ " <td>0.0</td>\n",
1051
+ " <td>0.645092</td>\n",
1052
+ " <td>0.0</td>\n",
1053
+ " <td>0.0</td>\n",
1054
+ " <td>0.601979</td>\n",
1055
+ " <td>...</td>\n",
1056
+ " <td>0.0</td>\n",
1057
+ " <td>0.0</td>\n",
1058
+ " <td>0.0</td>\n",
1059
+ " <td>0.167036</td>\n",
1060
+ " <td>0.0</td>\n",
1061
+ " <td>0.242198</td>\n",
1062
+ " <td>0.0</td>\n",
1063
+ " <td>0.0</td>\n",
1064
+ " <td>0.156851</td>\n",
1065
+ " <td>0.0</td>\n",
1066
+ " </tr>\n",
1067
+ " <tr>\n",
1068
+ " <th>2</th>\n",
1069
+ " <td>0.288741</td>\n",
1070
+ " <td>0.780471</td>\n",
1071
+ " <td>1.000000</td>\n",
1072
+ " <td>0.110648</td>\n",
1073
+ " <td>0.603694</td>\n",
1074
+ " <td>0.0</td>\n",
1075
+ " <td>0.614604</td>\n",
1076
+ " <td>0.0</td>\n",
1077
+ " <td>0.0</td>\n",
1078
+ " <td>0.608478</td>\n",
1079
+ " <td>...</td>\n",
1080
+ " <td>0.0</td>\n",
1081
+ " <td>0.0</td>\n",
1082
+ " <td>0.0</td>\n",
1083
+ " <td>0.123966</td>\n",
1084
+ " <td>0.0</td>\n",
1085
+ " <td>0.217510</td>\n",
1086
+ " <td>0.0</td>\n",
1087
+ " <td>0.0</td>\n",
1088
+ " <td>0.123984</td>\n",
1089
+ " <td>0.0</td>\n",
1090
+ " </tr>\n",
1091
+ " <tr>\n",
1092
+ " <th>3</th>\n",
1093
+ " <td>0.083090</td>\n",
1094
+ " <td>0.087487</td>\n",
1095
+ " <td>0.110648</td>\n",
1096
+ " <td>1.000000</td>\n",
1097
+ " <td>0.028047</td>\n",
1098
+ " <td>0.0</td>\n",
1099
+ " <td>0.190443</td>\n",
1100
+ " <td>0.0</td>\n",
1101
+ " <td>0.0</td>\n",
1102
+ " <td>0.032498</td>\n",
1103
+ " <td>...</td>\n",
1104
+ " <td>0.0</td>\n",
1105
+ " <td>0.0</td>\n",
1106
+ " <td>0.0</td>\n",
1107
+ " <td>0.000000</td>\n",
1108
+ " <td>0.0</td>\n",
1109
+ " <td>0.000000</td>\n",
1110
+ " <td>0.0</td>\n",
1111
+ " <td>0.0</td>\n",
1112
+ " <td>0.039841</td>\n",
1113
+ " <td>0.0</td>\n",
1114
+ " </tr>\n",
1115
+ " <tr>\n",
1116
+ " <th>4</th>\n",
1117
+ " <td>0.310237</td>\n",
1118
+ " <td>0.587482</td>\n",
1119
+ " <td>0.603694</td>\n",
1120
+ " <td>0.028047</td>\n",
1121
+ " <td>1.000000</td>\n",
1122
+ " <td>0.0</td>\n",
1123
+ " <td>0.563808</td>\n",
1124
+ " <td>0.0</td>\n",
1125
+ " <td>0.0</td>\n",
1126
+ " <td>0.630617</td>\n",
1127
+ " <td>...</td>\n",
1128
+ " <td>0.0</td>\n",
1129
+ " <td>0.0</td>\n",
1130
+ " <td>0.0</td>\n",
1131
+ " <td>0.099909</td>\n",
1132
+ " <td>0.0</td>\n",
1133
+ " <td>0.178126</td>\n",
1134
+ " <td>0.0</td>\n",
1135
+ " <td>0.0</td>\n",
1136
+ " <td>0.087996</td>\n",
1137
+ " <td>0.0</td>\n",
1138
+ " </tr>\n",
1139
+ " <tr>\n",
1140
+ " <th>...</th>\n",
1141
+ " <td>...</td>\n",
1142
+ " <td>...</td>\n",
1143
+ " <td>...</td>\n",
1144
+ " <td>...</td>\n",
1145
+ " <td>...</td>\n",
1146
+ " <td>...</td>\n",
1147
+ " <td>...</td>\n",
1148
+ " <td>...</td>\n",
1149
+ " <td>...</td>\n",
1150
+ " <td>...</td>\n",
1151
+ " <td>...</td>\n",
1152
+ " <td>...</td>\n",
1153
+ " <td>...</td>\n",
1154
+ " <td>...</td>\n",
1155
+ " <td>...</td>\n",
1156
+ " <td>...</td>\n",
1157
+ " <td>...</td>\n",
1158
+ " <td>...</td>\n",
1159
+ " <td>...</td>\n",
1160
+ " <td>...</td>\n",
1161
+ " <td>...</td>\n",
1162
+ " </tr>\n",
1163
+ " <tr>\n",
1164
+ " <th>2625</th>\n",
1165
+ " <td>0.241866</td>\n",
1166
+ " <td>0.242198</td>\n",
1167
+ " <td>0.217510</td>\n",
1168
+ " <td>0.000000</td>\n",
1169
+ " <td>0.178126</td>\n",
1170
+ " <td>0.0</td>\n",
1171
+ " <td>0.343441</td>\n",
1172
+ " <td>0.0</td>\n",
1173
+ " <td>0.0</td>\n",
1174
+ " <td>0.270278</td>\n",
1175
+ " <td>...</td>\n",
1176
+ " <td>0.0</td>\n",
1177
+ " <td>0.0</td>\n",
1178
+ " <td>0.0</td>\n",
1179
+ " <td>0.523597</td>\n",
1180
+ " <td>0.0</td>\n",
1181
+ " <td>1.000000</td>\n",
1182
+ " <td>0.0</td>\n",
1183
+ " <td>0.0</td>\n",
1184
+ " <td>0.186761</td>\n",
1185
+ " <td>0.0</td>\n",
1186
+ " </tr>\n",
1187
+ " <tr>\n",
1188
+ " <th>2626</th>\n",
1189
+ " <td>0.000000</td>\n",
1190
+ " <td>0.000000</td>\n",
1191
+ " <td>0.000000</td>\n",
1192
+ " <td>0.000000</td>\n",
1193
+ " <td>0.000000</td>\n",
1194
+ " <td>0.0</td>\n",
1195
+ " <td>0.000000</td>\n",
1196
+ " <td>0.0</td>\n",
1197
+ " <td>0.0</td>\n",
1198
+ " <td>0.000000</td>\n",
1199
+ " <td>...</td>\n",
1200
+ " <td>0.0</td>\n",
1201
+ " <td>0.0</td>\n",
1202
+ " <td>0.0</td>\n",
1203
+ " <td>0.000000</td>\n",
1204
+ " <td>0.0</td>\n",
1205
+ " <td>0.000000</td>\n",
1206
+ " <td>0.0</td>\n",
1207
+ " <td>0.0</td>\n",
1208
+ " <td>0.000000</td>\n",
1209
+ " <td>0.0</td>\n",
1210
+ " </tr>\n",
1211
+ " <tr>\n",
1212
+ " <th>2627</th>\n",
1213
+ " <td>0.000000</td>\n",
1214
+ " <td>0.000000</td>\n",
1215
+ " <td>0.000000</td>\n",
1216
+ " <td>0.000000</td>\n",
1217
+ " <td>0.000000</td>\n",
1218
+ " <td>0.0</td>\n",
1219
+ " <td>0.000000</td>\n",
1220
+ " <td>0.0</td>\n",
1221
+ " <td>0.0</td>\n",
1222
+ " <td>0.000000</td>\n",
1223
+ " <td>...</td>\n",
1224
+ " <td>0.0</td>\n",
1225
+ " <td>0.0</td>\n",
1226
+ " <td>0.0</td>\n",
1227
+ " <td>0.000000</td>\n",
1228
+ " <td>0.0</td>\n",
1229
+ " <td>0.000000</td>\n",
1230
+ " <td>0.0</td>\n",
1231
+ " <td>0.0</td>\n",
1232
+ " <td>0.000000</td>\n",
1233
+ " <td>0.0</td>\n",
1234
+ " </tr>\n",
1235
+ " <tr>\n",
1236
+ " <th>2628</th>\n",
1237
+ " <td>0.105519</td>\n",
1238
+ " <td>0.156851</td>\n",
1239
+ " <td>0.123984</td>\n",
1240
+ " <td>0.039841</td>\n",
1241
+ " <td>0.087996</td>\n",
1242
+ " <td>0.0</td>\n",
1243
+ " <td>0.164395</td>\n",
1244
+ " <td>0.0</td>\n",
1245
+ " <td>0.0</td>\n",
1246
+ " <td>0.097106</td>\n",
1247
+ " <td>...</td>\n",
1248
+ " <td>0.0</td>\n",
1249
+ " <td>0.0</td>\n",
1250
+ " <td>0.0</td>\n",
1251
+ " <td>0.082406</td>\n",
1252
+ " <td>0.0</td>\n",
1253
+ " <td>0.186761</td>\n",
1254
+ " <td>0.0</td>\n",
1255
+ " <td>0.0</td>\n",
1256
+ " <td>1.000000</td>\n",
1257
+ " <td>0.0</td>\n",
1258
+ " </tr>\n",
1259
+ " <tr>\n",
1260
+ " <th>2629</th>\n",
1261
+ " <td>0.000000</td>\n",
1262
+ " <td>0.000000</td>\n",
1263
+ " <td>0.000000</td>\n",
1264
+ " <td>0.000000</td>\n",
1265
+ " <td>0.000000</td>\n",
1266
+ " <td>0.0</td>\n",
1267
+ " <td>0.000000</td>\n",
1268
+ " <td>0.0</td>\n",
1269
+ " <td>0.0</td>\n",
1270
+ " <td>0.000000</td>\n",
1271
+ " <td>...</td>\n",
1272
+ " <td>0.0</td>\n",
1273
+ " <td>0.0</td>\n",
1274
+ " <td>0.0</td>\n",
1275
+ " <td>0.000000</td>\n",
1276
+ " <td>0.0</td>\n",
1277
+ " <td>0.000000</td>\n",
1278
+ " <td>0.0</td>\n",
1279
+ " <td>0.0</td>\n",
1280
+ " <td>0.000000</td>\n",
1281
+ " <td>0.0</td>\n",
1282
+ " </tr>\n",
1283
+ " </tbody>\n",
1284
+ "</table>\n",
1285
+ "<p>2630 rows × 2630 columns</p>\n",
1286
+ "</div>"
1287
+ ],
1288
+ "text/plain": [
1289
+ "name Bivalirudin Leuprolide Goserelin Gramicidin D Desmopressin \\\n",
1290
+ "0 1.000000 0.306150 0.288741 0.083090 0.310237 \n",
1291
+ "1 0.306150 1.000000 0.780471 0.087487 0.587482 \n",
1292
+ "2 0.288741 0.780471 1.000000 0.110648 0.603694 \n",
1293
+ "3 0.083090 0.087487 0.110648 1.000000 0.028047 \n",
1294
+ "4 0.310237 0.587482 0.603694 0.028047 1.000000 \n",
1295
+ "... ... ... ... ... ... \n",
1296
+ "2625 0.241866 0.242198 0.217510 0.000000 0.178126 \n",
1297
+ "2626 0.000000 0.000000 0.000000 0.000000 0.000000 \n",
1298
+ "2627 0.000000 0.000000 0.000000 0.000000 0.000000 \n",
1299
+ "2628 0.105519 0.156851 0.123984 0.039841 0.087996 \n",
1300
+ "2629 0.000000 0.000000 0.000000 0.000000 0.000000 \n",
1301
+ "\n",
1302
+ "name Cetrorelix Daptomycin Abarelix Pyridoxal phosphate Cyanocobalamin \\\n",
1303
+ "0 0.0 0.338457 0.0 0.0 0.430352 \n",
1304
+ "1 0.0 0.645092 0.0 0.0 0.601979 \n",
1305
+ "2 0.0 0.614604 0.0 0.0 0.608478 \n",
1306
+ "3 0.0 0.190443 0.0 0.0 0.032498 \n",
1307
+ "4 0.0 0.563808 0.0 0.0 0.630617 \n",
1308
+ "... ... ... ... ... ... \n",
1309
+ "2625 0.0 0.343441 0.0 0.0 0.270278 \n",
1310
+ "2626 0.0 0.000000 0.0 0.0 0.000000 \n",
1311
+ "2627 0.0 0.000000 0.0 0.0 0.000000 \n",
1312
+ "2628 0.0 0.164395 0.0 0.0 0.097106 \n",
1313
+ "2629 0.0 0.000000 0.0 0.0 0.000000 \n",
1314
+ "\n",
1315
+ "name ... Naphthoquine Odevixibat Melphalan flufenamide Deucravacitinib \\\n",
1316
+ "0 ... 0.0 0.0 0.0 0.305532 \n",
1317
+ "1 ... 0.0 0.0 0.0 0.167036 \n",
1318
+ "2 ... 0.0 0.0 0.0 0.123966 \n",
1319
+ "3 ... 0.0 0.0 0.0 0.000000 \n",
1320
+ "4 ... 0.0 0.0 0.0 0.099909 \n",
1321
+ "... ... ... ... ... ... \n",
1322
+ "2625 ... 0.0 0.0 0.0 0.523597 \n",
1323
+ "2626 ... 0.0 0.0 0.0 0.000000 \n",
1324
+ "2627 ... 0.0 0.0 0.0 0.000000 \n",
1325
+ "2628 ... 0.0 0.0 0.0 0.082406 \n",
1326
+ "2629 ... 0.0 0.0 0.0 0.000000 \n",
1327
+ "\n",
1328
+ "name Tegoprazan Belumosudil Tebipenem pivoxil Tosufloxacin Linzagolix \\\n",
1329
+ "0 0.0 0.241866 0.0 0.0 0.105519 \n",
1330
+ "1 0.0 0.242198 0.0 0.0 0.156851 \n",
1331
+ "2 0.0 0.217510 0.0 0.0 0.123984 \n",
1332
+ "3 0.0 0.000000 0.0 0.0 0.039841 \n",
1333
+ "4 0.0 0.178126 0.0 0.0 0.087996 \n",
1334
+ "... ... ... ... ... ... \n",
1335
+ "2625 0.0 1.000000 0.0 0.0 0.186761 \n",
1336
+ "2626 0.0 0.000000 0.0 0.0 0.000000 \n",
1337
+ "2627 0.0 0.000000 0.0 0.0 0.000000 \n",
1338
+ "2628 0.0 0.186761 0.0 0.0 1.000000 \n",
1339
+ "2629 0.0 0.000000 0.0 0.0 0.000000 \n",
1340
+ "\n",
1341
+ "name Methionine C-11 \n",
1342
+ "0 0.0 \n",
1343
+ "1 0.0 \n",
1344
+ "2 0.0 \n",
1345
+ "3 0.0 \n",
1346
+ "4 0.0 \n",
1347
+ "... ... \n",
1348
+ "2625 0.0 \n",
1349
+ "2626 0.0 \n",
1350
+ "2627 0.0 \n",
1351
+ "2628 0.0 \n",
1352
+ "2629 0.0 \n",
1353
+ "\n",
1354
+ "[2630 rows x 2630 columns]"
1355
+ ]
1356
+ },
1357
+ "execution_count": 9,
1358
+ "metadata": {},
1359
+ "output_type": "execute_result"
1360
+ }
1361
+ ],
1362
+ "source": [
1363
+ "df_predicted = pd.DataFrame.sparse.from_spmatrix(links)\n",
1364
+ "df_predicted.columns = df_matrix.columns\n",
1365
+ "df_predicted"
1366
+ ]
1367
+ },
1368
+ {
1369
+ "cell_type": "code",
1370
+ "execution_count": 10,
1371
+ "metadata": {},
1372
+ "outputs": [],
1373
+ "source": [
1374
+ "predictions = get_pairs_from_adjacency_matrix(df_predicted, excluded_pairs)"
1375
+ ]
1376
+ },
1377
+ {
1378
+ "cell_type": "code",
1379
+ "execution_count": 11,
1380
+ "metadata": {},
1381
+ "outputs": [
1382
+ {
1383
+ "data": {
1384
+ "text/plain": [
1385
+ "130972.25693560754"
1386
+ ]
1387
+ },
1388
+ "execution_count": 11,
1389
+ "metadata": {},
1390
+ "output_type": "execute_result"
1391
+ }
1392
+ ],
1393
+ "source": [
1394
+ "predictions.sum()"
1395
+ ]
1396
+ },
1397
+ {
1398
+ "cell_type": "code",
1399
+ "execution_count": 12,
1400
+ "metadata": {},
1401
+ "outputs": [
1402
+ {
1403
+ "data": {
1404
+ "text/plain": [
1405
+ "0.9689347313012144"
1406
+ ]
1407
+ },
1408
+ "execution_count": 12,
1409
+ "metadata": {},
1410
+ "output_type": "execute_result"
1411
+ }
1412
+ ],
1413
+ "source": [
1414
+ "import numpy as np\n",
1415
+ "from sklearn import metrics\n",
1416
+ "metrics.roc_auc_score(excluded_pair_values, predictions)"
1417
+ ]
1418
+ },
1419
+ {
1420
+ "cell_type": "code",
1421
+ "execution_count": 13,
1422
+ "metadata": {},
1423
+ "outputs": [
1424
+ {
1425
+ "data": {
1426
+ "text/plain": [
1427
+ "<sklearn.metrics._plot.roc_curve.RocCurveDisplay at 0x1f500c9a550>"
1428
+ ]
1429
+ },
1430
+ "execution_count": 13,
1431
+ "metadata": {},
1432
+ "output_type": "execute_result"
1433
+ },
1434
+ {
1435
+ "data": {
1436
+ "image/png": "iVBORw0KGgoAAAANSUhEUgAAAYIAAAEGCAYAAABo25JHAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjMuNCwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8QVMy6AAAACXBIWXMAAAsTAAALEwEAmpwYAAAw1UlEQVR4nO3dd5hV5bn38e89jaF3EClSBA1gRETsBo1dbEePwUQ8dn0tpLzxhPeYGKMmmpgTE2OLGo4dYomKJaKeiFiiAjrgAApIkSq9M3Xf7x9rzbin7jXDrBlm9u9zXXPNXm2ve+2Bde+nrOcxd0dERNJXRlMHICIiTUuJQEQkzSkRiIikOSUCEZE0p0QgIpLmspo6gLrq1q2b9+/fv6nDEBFpVmbPnr3B3btXt63ZJYL+/fsza9aspg5DRKRZMbPlNW1T1ZCISJpTIhARSXNKBCIiaU6JQEQkzSkRiIikudgSgZlNMrN1ZpZfw3Yzs3vMbLGZzTWzkXHFIiIiNYuzRPAocGot208DBoc/VwEPxBiLiIjUILbnCNx9hpn1r2WXs4HHPRgH+0Mz62Rmvdx9TVwxiewpd6eoNEFRSYLCsp/iUopLneLSBEWlCYpLEpQkgv1KSp3SRILiUqckESy7gxP8TiS9dnccwvVe/pvkdUmvG+Na635MHfev8xnqfo7gPHU7qH7nqOsBdT/JqP5dOG5Itc+E7ZGmfKCsN7AiaXlluK5KIjCzqwhKDfTr169RgpOWoagkwbaCYrbtLmbr7mK27C5me0EJOwpK2FFYzI7CUnYWlrCrqIQdhaXsKChmZ1GwbndxKQVFpRSUBDf+opLgRi/SUMzqtv813xnU4hJBdR9BtSnS3R8CHgIYNWqUZtJJI+7O7uJStuwqZvOuIrYXlLCzsIRtBcVs3lnMll1FbN5VXH6z31FYwtbdxWzeFdz4i0pS37jb5GTStlUWbXMyaZ+bTZucTHp2yKVNTiatszNplZ1BTmYmOVkZwU+mkZudSatwuVVWsC07M4OsTCMnM4OsDCM7K4PsjGBddqaRlZFBZoZhBmZGhoFRtvzN6wwzjOA34baydWWvoe43EQjOUaf963WOOu5fj5PUI6w6X0t94mqumjIRrAT6Ji33AVY3USzSiIpKEqzespt12wvZsKOQddsKWL+jkA3bi8KbeBGbdxWxZVfwDb62m7kZdGydTYfcbDq0zqJDbjYDurVlZJscOrbOpn1uFu1zs4N9WmfRsXUOHVtn0bZVFu1aZdE2J4uMjPT5Dy9SnaZMBFOB681sCnA4sFXtAy1DacJZvWU3SzfsZMXmXazYtJvlG3eyastuVm/ZzcadRVWqRzMzjC5tc+jcJptOrXMY2K0dndoEN/DObYObeuc22bTPzaZtqyw65GbRKbzZZ+pGLrJHYksEZjYZGAN0M7OVwC+BbAB3fxB4DTgdWAzsAi6NKxaJR0FxKfPXbGPR19tZsn4nX67fybKNO/lq064K3+KzM42+XdrQu1NrhvbqQM8OufTt0oYe7VvRPfzp0iZH38xFmkicvYYuTLHdgeviOr80nETCWbVlN4vX7eDztdv5Yu02FqzZzuL1OyhNBF/tczIz2K9rGwZ2a8sJB/ZgYLe2DOjWlr5d2tCzQ66+tYvsxZrdMNQSr627ilmwdhsLv97OF2u3s2DNNj5fu51dRaXl+/TqmMuB+7TnpKE9Gd67I0N7daB359a62Ys0U0oEaWzdtgLyV28lf9U2FqzZxpwVW1i9taB8e4fcLA7s1YELRvVlSM/2DO7ZjiE929OxdXYTRi0iDU2JIE1s3lnEZ6u2MnflFvJWBL/XbS8s396/axsO7d+Fi/ftwAH7tOfAfdqzT4fctOpCJ5KulAhaoNKE8+X6HeSt2MLsZZv5eNkmlm7YWb59YPe2HL1/Nw7q3TGo2tm3A+1a6Z+CSLrS//4WYMOOQmYu3cSnK7Ywd+UW8ldtY0dhCRD0sR+1X2fG
HdaXYft25KA+HVW1IyIVKBE0Q2u27ubDJRuZtWwz7y7awFebdgGQk5XBt3p14NxDejOibycO7tuRgd3aqVumiNQqUiIwswzgYGBfYDcwz92/jjMw+cbmnUV88OVG3v9yA+8uWs+KTbsBaNcqiyMGduGiI/px6H6dOah3J3KyNMWEiNRNrYnAzAYBPwNOBBYB64FcYIiZ7QL+Ajzm7hqJqwEVlpQye9lm3l28gXe+WM/na7eR8LIbf1cuOWoAhw/owrd6dVCXTRHZY6lKBLcTzBNwtVcak9bMegDfB8YDj8UTXvpYt72AafO+Zvrn6/jgy43sLi4lM8M4rH9nbjhhMMcN6c63+3QkO1Pf+EWkYdWaCGp7Otjd1wF/bOiA0smGHYW8Me9rps5ZxUdLN+EO+3Vtw/mH9uE7Q7pz+MAutM9Vw66IxKvejcVmdpK7v9mQwaSD7QXFvDp3Da/MXcMHX24g4UF3zgknDOa0g/bhwH06NHWIIpJm9qTX0F8BzRITgbvz4ZJNPPXRct5a8DUFxQn269qGa8fsz+kH9eJbvdrrwS0RaTKpGoun1rQJ6Nrw4bQsm3YW8fRHy3l29kqWb9xFpzbZnH9oH/5tZB8O6dtJN38R2SukKhEcC1wE7Ki03oDRsUTUAixet4O/vreEFz9dze7iUo4Y2IUJJwzm9IN60Tons6nDExGpIFUi+BDY5e7vVN5gZl/EE1LztWbrbn73+he8mLeK7MwMzh3RmyuOHcDgnu2bOjQRkRql6jV0Wi3bjmv4cJqn4tIEk95byr3/XExhaYKrjxvEFccOoFu7Vk0dmohIShpiYg8tXredH/0tj/xV2zj+gO786qzh9OvapqnDEhGJTImgntydx/+1nF+/toB2rbJ48KKRnDq8V1OHJSJSZ0oE9VBQXMrE5+fyYt5qTjiwB3eedxA92uc2dVgiIvWiRFBH2wqKufKxWXy0dBM/OWkI1x+/v0b3FJFmLfLANWZ2S23L6WDt1gIuePBfzF6+mT+NG8GE7w5WEhCRZq8uJYLZKZZbtFVbdnPhQx+ycUch/3PpYRw7uHtThyQi0iAiJwJ3f7m25ZZs3bYCLnrkIzbvKuLJKw7nkH6dmzokEZEGk2qIiT8DXtN2d5/Q4BHtZbbuLuaS/5nJ19sKePyy0UoCItLipCoRzGqUKPZSJaUJrnp8FovWbeeR/ziMUf27NHVIIiINLtWTxRUmnDGztu6+M96Q9h6/f2MhHy3dxH//+8F8Z4jaBESkZYrUa8jMjjSz+cCCcPlgM7s/1sia2IyF6/nLjC8Zd1hfzju0T1OHIyISm6jdR/8InAJsBHD3OUCLHWtoZ2EJP3t+LoO6t+OXZw5r6nBERGIV+TkCd19RaVVpA8ey17jjHwtYu62A3553kIaNFpEWL2r30RVmdhTgZpYDTCCsJmpp8ldt5ckPv+Kyowdw6H5qHBaRli9qieAa4DqgN7AKGBEutzi/ff1zurTN4YcnDm7qUEREGkWkRODuG9z9B+7e0927u/tF7r4x1XFmdqqZfWFmi81sYjXbO5rZy2Y2x8zmmdml9bmIhvLuovW8u2gD144ZRMfW2U0ZiohIo4naa2hgeMNeb2brzOwlMxuY4phM4D7gNGAocKGZDa2023XAfHc/GBgD/HdY9dTo3J0/vrWIfTvmMv7I/ZoiBBGRJhG1auhp4BmgF7Av8CwwOcUxo4HF7r7E3YuAKcDZlfZxoL0Fs7i3AzYBJRFjalAzl21m9vLN/MdR/WmVpQZiEUkfUROBufsT7l4S/jxJLUNPhHoDyT2NVobrkt0LfAtYDXwG/NDdE1VObnaVmc0ys1nr16+PGHLdPP6vZXTIzVJpQETSTq2JwMy6mFkX4G0zm2hm/c1sPzP7T+DVFO9d3fjMlZPHKUAeQSljBHCvmXWocpD7Q+4+yt1Hde/e8E/4rt1awOv5azn/0L60ydEUDSKSXlLd9WYT3LzLbupXJ21z4LZajl0J9E1a7kPw
zT/ZpcCd7u7AYjNbChwIfJwirgb17KwVlCSc/zhKpQERST+pxhoasAfvPRMYbGYDCLqcjgO+X2mfr4DvAu+aWU/gAGDJHpyzztydF/NWcfiALuzXtW1jnlpEZK8QuR7EzIYT9P4pn5zX3R+vaX93LzGz64FpQCYwyd3nmdk14fYHCUoUj5rZZwSljp+5+4Z6XUk9zV+zjS/X7+SyY/Yk54mINF+REoGZ/ZKge+dQ4DWCLqHvATUmAgB3fy3cP3ndg0mvVwMn1yniBvbPBesAOHnoPk0ZhohIk4naa+h8giqcte5+KXAw0Cq2qBrRjEXrOah3R7q3bxGXIyJSZ1ETwe6wW2dJ2KtnHVDrA2XNwbaCYj75agvHDu7W1KGIiDSZqG0Es8ysE/AwQU+iHTRyz544zF62mdKEc8z+SgQikr4iJQJ3vzZ8+aCZvQ50cPe58YXVOD5auomsDNM8xCKS1lJNXj+ytm3u/knDh9R4Zi3bxPDeHTXngIiktVQlgv+uZZsDJzRgLI2qoLiUuSu3cunR/Zs6FBGRJpXqgbLjGyuQxrZgzTaKShMc0q9TU4ciItKkIk9V2dIsWLMdgGH7dmziSEREmlbaJoKFX2+nTU4mvTu1bupQRESaVNomgi/Wbmdwz/ZkZFQ3SKqISPqIOkOZmdlFZnZzuNzPzEbHG1q8Fq3bwZAe7Zo6DBGRJhe1RHA/cCRwYbi8nWAaymZpW0ExG3YUMkiJQEQk8pPFh7v7SDP7FMDdNzfV3MINYen6nQAM6KZhp0VEopYIisPJ6B3AzLoDVaaUbC6Wb9oFQH/NPyAiEjkR3AO8APQws18TDEH9m9iiitmqzbsB6N1ZPYZERKKONfSUmc0mGIragHPcfUGskcVo9ZbddGydTbtWmp9YRCTqxDR/Av7m7s22gTjZ6i276dUxN/WOIiJpIGrV0CfAz81ssZndZWaj4gwqbqu3FrCvHiQTEQEiJgJ3f8zdTwdGAwuB35rZolgji9GarbvZt5NKBCIiUPcni/cHDgT6A583eDSNoLCklC27iunZXolARASiP1lcVgK4FZgHHOruZ8YaWUw27ywGoEu7ZvsYhIhIg4rabWYpcKS7b4gzmMawaWcRAJ3bKBGIiEDqGcoOdPfPCeYn7mdm/ZK3N8cZyrbsUiIQEUmWqkTwE+Aqqp+prFnOULZld1A11LltdhNHIiKyd0g1Q9lV4cvT3L0geZuZNcvW1i27gkTQsbUSgYgIRO819EHEdXu9rWGJoFNrVQ2JiEDqNoJ9gN5AazM7hGB4CYAOQJuYY4vFtoJisjON3Oy0nZNHRKSCVG0EpwCXAH2APySt3w78V0wxxWp7QTHtc7Mx08xkIiKQuo3gMeAxMzvP3Z9vpJhitb2gRIPNiYgkSVU1dJG7Pwn0N7OfVN7u7n+o5rC92g4lAhGRClJVlJfN3NIOaF/NT63M7FQz+yIcrG5iDfuMMbM8M5tnZu/UIfZ62VmkRCAikixV1dBfwt+/qusbhzOa3QecBKwEZprZVHefn7RPJ4L5kE9196/MrEddz1NXu4sT6joqIpIk6lhDvzOzDmaWbWb/a2YbzOyiFIeNBha7+xJ3LwKmAGdX2uf7wN/d/SsAd19X1wuoq8LiUnKz1GNIRKRM1Dviye6+DRhL8O1+CHBjimN6AyuSlleG65INATqb2XQzm21mF1f3RmZ2lZnNMrNZ69evjxhy9XYXl5KbnblH7yEi0pJETQRldSmnA5PdfVOEY6rrn+mVlrOAQ4EzCLqq/sLMhlQ5yP0hdx/l7qO6d+8eMeTqFRSX0iZHiUBEpEzUVtOXzexzYDdwrZl1BwpSHLMS6Ju03AdYXc0+G9x9J7DTzGYABxNMfhOLXUUqEYiIJIs6Q9lE4EhglLsXAzupWt9f2UxgsJkNMLMcYBwwtdI+LwHHmlmWmbUBDgcW1OUC6qqguJTWKhGIiJSLOnl9NjAeOC58Ivcd4MHajnH3EjO7HpgGZAKT3H2e
mV0Tbn/Q3ReY2evAXCABPOLu+fW+mhQSCae41GmlxmIRkXJRq4YeIGgnuD9cHh+uu6K2g9z9NeC1SuserLR8F3BXxDj2SFFpAoBWWSoRiIiUiZoIDnP3g5OW/2lmc+IIKE6FxWWJQCUCEZEyUe+IpWY2qGzBzAYCpfGEFJ/CkiDkHCUCEZFyUUsENwJvm9kSgm6h+wGXxhZVTApLVCIQEaksZSIIu4puJXhSuAdBIvjc3Qtjjq3BlSUClQhERL5R6x3RzK4A5gF/BvKA/u4+pzkmAfimakiNxSIi30hVIvgRMMzd14ftAk9R9VmAZqOovESgSWlERMqkqiMpcvf1AO6+BGgVf0jxKS4NRrjIyVSJQESkTKoSQR8zu6emZXefEE9Y8SgJnyPIzlSJQESkTKpEUHmE0dlxBdIYyh4oy8pUY7GISJkocxa3GCXlVUNKBCIiZVL1GnrIzIbXsK2tmV1mZj+IJ7SGV1xeIlDVkIhImVRVQ/cDN5vZQUA+sB7IBQYDHYBJBD2JmoXiRFAiUBuBiMg3UlUN5QEXmFk7YBTQi2BOggXu/kX84TWs0kRYIshQ1ZCISJlIQ0y4+w5geryhxK+sjSAzQyUCEZEyafXVuDSsGlIbgYjIN9IqEZS1EahEICLyjTolAjNrG1cgjaG07IEytRGIiJSLdEc0s6PMbD7hfMJmdrCZ3Z/isL1OSVgiyFCJQESkXNSvxncDpwAbAdx9DnBcXEHFJeGqGhIRqSxyHYm7r6i0qtnNUBYWCFAeEBH5RtQZylaY2VGAm1kOMIGwmqg5Kes1lGHKBCIiZaKWCK4BrgN6AyuBEcC1McUUm4R6DYmIVBG1RHCAu1cYU8jMjgbeb/iQ4lNa1kagEoGISLmoJYI/R1y3VytvI1CJQESkXK0lAjM7EjgK6G5mP0na1AFodtN8JRKuhmIRkUpSVQ3lAO3C/donrd8GnB9XUHEpdVf7gIhIJalGH30HeMfMHnX35Y0UU2yCEoESgYhIsqiNxbvM7C5gGMF8BAC4+wmxRBWT0oRKBCIilUVtLH4K+BwYAPwKWAbMjCmm2JS6q8eQiEglURNBV3f/K1Ds7u+4+2XAETHGFQt39RgSEaksatVQcfh7jZmdAawG+sQTUnxK1WtIRKSKqCWC282sI/B/gZ8CjwA/SnWQmZ1qZl+Y2WIzm1jLfoeZWamZxdoTKeFqLBYRqSzqVJWvhC+3AsdD+ZPFNTKzTOA+4CSCYSlmmtlUd59fzX6/BabVLfS6S6hqSESkilpLBGaWaWYXmtlPzWx4uG6smX0A3JvivUcDi919ibsXAVOAs6vZ7wbgeWBd3cOvGz1QJiJSVaoSwV+BvsDHwD1mthw4Epjo7i+mOLY3kDx09Urg8OQdzKw3cC5wAnBYTW9kZlcBVwH069cvxWlrllCvIRGRKlIlglHAt909YWa5wAZgf3dfG+G9q7vjeqXlPwI/c/dSq+UG7e4PAQ8BjBo1qvJ7RFbqTm3nERFJR6kSQZG7JwDcvcDMFkZMAhCUAPomLfch6G2UbBQwJbw5dwNON7OSCKWNenHXENQiIpWlSgQHmtnc8LUBg8JlA9zdv13LsTOBwWY2AFgFjAO+n7yDuw8oe21mjwKvxJUEQN1HRUSqkyoRfKu+b+zuJWZ2PUFvoExgkrvPM7Nrwu0P1ve960vdR0VEqko16NweDTTn7q8Br1VaV20CcPdL9uRcUSQ0+qiISBWRJ69vCRIJzVcsIlJZeiUCd5QHREQqipwIzKy1mR0QZzBxS7hKBCIilUVKBGZ2JpAHvB4ujzCzqTHGFYuEOxlpVQYSEUkt6m3xFoIhI7YAuHse0D+OgOKkJ4tFRKqKmghK3H1rrJE0goSjJ4tFRCqJOh9Bvpl9H8g0s8HABOCD+MKKh7seKBMRqSxqieAGgvmKC4GnCYaj/lFMMcVGD5SJiFQVtURwgLvfBNwUZzBx03MEIiJVRS0R/MHMPjez28xsWKwRxUjPEYiI
VBUpEbj78cAYYD3wkJl9ZmY/jzOwOKhqSESkqsi96t19rbvfA1xD8EzBzXEFFZdgqsqmjkJEZO8S9YGyb5nZLWaWTzBF5QcE8ws0K+6OVTtfjohI+oraWPw/wGTgZHevPLlMs6HJ60VEqoqUCNz9iLgDaQxBiUBERJLVmgjM7Bl3v8DMPqPifMNRZijb6zjogTIRkUpSlQh+GP4eG3cgjSGhyetFRKqotbHY3deEL6919+XJP8C18YfXsNxVIhARqSxqZ8qTqll3WkMG0hgSDqiVQESkglRtBP+H4Jv/QDObm7SpPfB+nIHFQYPOiYhUlaqN4GngH8AdwMSk9dvdfVNsUcXENUOZiEgVqRKBu/syM7uu8gYz69LckoHGGhIRqSpKiWAsMJug92XybdSBgTHFFYug+6gygYhIsloTgbuPDX8PaJxw4pVwV1uxiEglUccaOtrM2oavLzKzP5hZv3hDi4HaCEREqojaffQBYJeZHQz8J7AceCK2qGKS0BATIiJV1GXyegfOBv7k7n8i6ELarCT0QJmISBVRRx/dbmb/DxgPHGtmmUB2fGHFw9HENCIilUUtEXyPYOL6y9x9LdAbuCu2qGKSSKDGYhGRSqJOVbkWeAroaGZjgQJ3fzzWyGKiEoGISEVRew1dAHwM/DtwAfCRmZ0f4bhTzewLM1tsZhOr2f4DM5sb/nwQNkbHRo3FIiJVRW0juAk4zN3XAZhZd+At4LmaDgjbEe4jGLBuJTDTzKa6+/yk3ZYC33H3zWZ2GvAQcHjdLyMaDTEhIlJV1DaCjLIkENoY4djRwGJ3X+LuRcAUgl5H5dz9A3ffHC5+SMzzIGuICRGRqqKWCF43s2kE8xZD0Hj8WopjegMrkpZXUvu3/csJBrirwsyuAq4C6Nev/s+xefBe9T5eRKQlijpn8Y1m9m/AMQT9bh5y9xdSHFbdHderWYeZHU+QCI6p4fwPEVQbMWrUqGrfIwpXiUBEpIpU8xEMBn4PDAI+A37q7qsivvdKoG/Sch9gdTXn+DbwCHCau2+M+N71ogfKRESqSlXPPwl4BTiPYATSP9fhvWcCg81sgJnlAOOAqck7hOMV/R0Y7+4L6/De9RJMTKNMICKSLFXVUHt3fzh8/YWZfRL1jd29xMyuB6YBmcAkd59nZteE2x8Ebga6AveHdfcl7j6qrhcRVUKDj4qIVJEqEeSa2SF8c/9snbzs7rUmBnd/jUqNymECKHt9BXBFXYOur6CNQKlARCRZqkSwBvhD0vLapGUHTogjqLi4o8ZiEZFKUk1Mc3xjBdIYNEOZiEhVUR8oaxES7uo1JCJSSdolArURiIhUlFaJQG0EIiJVRR191MK5im8Ol/uZ2eh4Q2t4wdz1ygQiIsmilgjuB44ELgyXtxOMLNqsBDOUNXUUIiJ7l6iDzh3u7iPN7FOAcNjonBjjikVCVUMiIlVELREUh/MLOJTPR5CILaqYaIgJEZGqoiaCe4AXgB5m9mvgPeA3sUUVA3cPSwRKBCIiyaIOQ/2Umc0GvkswvMQ57r4g1sgamIeDV2cqEYiIVBApEYSjhO4CXk5e5+5fxRVYQ0uEmUCNxSIiFUVtLH6VcIIvIBcYAHwBDIsprgaXCEsEGcoEIiIVRK0aOih52cxGAlfHElFMykoEqhkSEamoXk8Wh8NPH9bAscSqrI1AvYZERCqK2kbwk6TFDGAksD6WiGKiNgIRkepFbSNon/S6hKDN4PmGDyc+3yQCZQIRkWQpE0H4IFk7d7+xEeKJTVljsZ4jEBGpqNY2AjPLcvdSgqqgZs1VNSQiUq1UJYKPCZJAnplNBZ4FdpZtdPe/xxhbgypNqGpIRKQ6UdsIugAbCeYoLnuewIFmkwjKnyNQHhARqSBVIugR9hjK55sEUMZjiyoG5VVDygSylyguLmblypUUFBQ0dSjSguTm5tKnTx+ys7MjH5MqEWQC7aDa2VyaVSIoSTSrcCUNrFy5kvbt29O/f391YpAG4e5s
3LiRlStXMmDAgMjHpUoEa9z91j0Lbe9Qlga2F5Q0aRwiZQoKCpQEpEGZGV27dmX9+ro95pXqyeIW8y80EZYIurZtdvPpSAumJCANrT7/plIlgu/WL5S9T1mvoaxM/ccTEUlWayJw902NFUjcStR9VKSKtWvXMm7cOAYNGsTQoUM5/fTTWbhwIcuWLWP48OENdp6bb76Zt956C4B3332XYcOGMWLECFatWsX555+/R+/t7pxwwgls27atfN0LL7yAmfH555+Xr5s+fTpjx46tcOwll1zCc889BwSN9xMnTmTw4MEMHz6c0aNH849//GOPYgO444472H///TnggAOYNm1atfvMmTOHI488koMOOogzzzyz/FqeeuopRowYUf6TkZFBXl4eACeeeCKbN2/e4/ignoPONUdlQ0xkqteQCBDcQM8991zGjBnDl19+yfz58/nNb37D119/3eDnuvXWWznxxBOB4Ob205/+lLy8PHr37l1+I46itLS0yrrXXnuNgw8+mA4dOpSvmzx5MscccwxTpkyJ/N6/+MUvWLNmDfn5+eTn5/Pyyy+zffv2yMdXZ/78+UyZMoV58+bx+uuvc+2111Z7DVdccQV33nknn332Geeeey533XUXAD/4wQ/Iy8sjLy+PJ554gv79+zNixAgAxo8fz/33379H8ZWJ+hxBs1deNaREIHuhX708j/mrt6XesQ6G7tuBX55Z85Qhb7/9NtnZ2VxzzTXl68puMsuWLStft2zZMsaPH8/OncGzpPfeey9HHXUUa9as4Xvf+x7btm2jpKSEBx54gKOOOorLL7+cWbNmYWZcdtll/PjHP+aSSy5h7NixbNmyhWeeeYZp06bx1ltv8etf/5qxY8eSn59PaWkpEydOZPr06RQWFnLddddx9dVXM336dH71q1/Rq1cv8vLymD9/foXreOqpp7jqqqvKl3fs2MH777/P22+/zVlnncUtt9yS8rPatWsXDz/8MEuXLqVVq1YA9OzZkwsuuCDlsbV56aWXGDduHK1atWLAgAHsv//+fPzxxxx55JEV9vviiy847rjjADjppJM45ZRTuO222yrsM3nyZC688MLy5bPOOotjjz2Wm266aY9ihDRMBKoaEgnk5+dz6KGHptyvR48evPnmm+Tm5rJo0SIuvPBCZs2axdNPP80pp5zCTTfdRGlpKbt27SIvL49Vq1aRn58PwJYtWyq81xVXXMF7773H2LFjOf/88ysknL/+9a907NiRmTNnUlhYyNFHH83JJ58MwMcff0x+fn61XSLff/99/vKXv5Qvv/jii5x66qkMGTKELl268MknnzByZO2j5CxevJh+/fpVKFXU5Mc//jFvv/12lfXjxo1j4sSJFdatWrWKI444ony5T58+rFq1qsqxw4cPZ+rUqZx99tk8++yzrFixoso+f/vb33jppZfKlzt37kxhYSEbN26ka9euKeOuTdokAlUNyd6stm/uTa24uJjrr7+evLw8MjMzWbhwIQCHHXYYl112GcXFxZxzzjmMGDGCgQMHsmTJEm644QbOOOOM8ht5FG+88QZz584tryraunUrixYtIicnh9GjR9fYL37Tpk20b//NAMmTJ0/mRz/6ERDcnCdPnszIkSNr7E1T1142d999d+R9yx5kTXW+SZMmMWHCBG699VbOOusscnIq9m786KOPaNOmTZV2mx49erB69eq9OxGY2anAnwgeTHvE3e+stN3C7acTzIl8STjpTYMrLlUiEEk2bNiwSPXzd999Nz179mTOnDkkEglyc3MBOO6445gxYwavvvoq48eP58Ybb+Tiiy9mzpw5TJs2jfvuu49nnnmGSZMmRYrH3fnzn//MKaecUmH99OnTadu2bY3HZWVlkUgkyMjIYOPGjfzzn/8kPz8fM6O0tBQz43e/+x1du3at0ri6adMmunXrxv77789XX33F9u3bKySV6tSlRNCnT58K3+5XrlzJvvvuW+XYAw88kDfeeAOAhQsX
8uqrr1bYPmXKlArVQmUKCgpo3bp1rfFGEVtjcTh89X3AacBQ4EIzG1ppt9OAweHPVcADccWzuyhooGnbKm0KQSK1OuGEEygsLOThhx8uXzdz5kzeeeedCvtt3bqVXr16kZGRwRNPPFHe2Ll8+XJ69OjBlVdeyeWXX84nn3zChg0bSCQSnHfeedx222188kn073WnnHIKDzzwAMXFxUBwQyxrl6jNAQccwJIlSwB47rnnuPjii1m+fDnLli1jxYoVDBgwgPfee4/BgwezevVqFixYUB7/nDlzGDFiBG3atOHyyy9nwoQJFBUVAbBmzRqefPLJKue7++67yxtwk38qJwEI6vGnTJlCYWEhS5cuZdGiRYwePbrKfuvWrQMgkUhw++23V2i3SSQSPPvss4wbN67CMe7O2rVr6d+/f8rPKJU4ew2NBha7+xJ3LwKmAGdX2uds4HEPfAh0MrNecQSzozB4orhNTmYcby/S7JgZL7zwAm+++SaDBg1i2LBh3HLLLVW+sV577bU89thjHHHEESxcuLD82/n06dMZMWIEhxxyCM8//zw//OEPWbVqFWPGjGHEiBFccskl3HHHHZHjueKKKxg6dCgjR45k+PDhXH311ZSUpB4J4IwzzmD69OlAUC107rnnVth+3nnn8fTTT9OqVSuefPJJLr30UkaMGMH555/PI488QseOHQG4/fbb6d69O0OHDmX48OGcc845dO/ePXL81Rk2bBgXXHABQ4cO5dRTT+W+++4jMzOz/HpnzZpVHveQIUM48MAD2Xfffbn00kvL32PGjBn06dOHgQMHVnjv2bNnc8QRR5CV1QBfbt09lh/gfILqoLLl8cC9lfZ5BTgmafl/gVHVvNdVwCxgVr9+/bw+Zi3b6Nc8McvXbNldr+NFGtr8+fObOoQWYfXq1X7iiSc2dRiNbsKECf7WW29Vu626f1vALK/hfh1nPUmUgeoiDWbn7g8BDwGMGjWqXqPHHbpfFw7dr0t9DhWRvVivXr248sor2bZtW6RePy3F8OHD+e53G2bwhzgTwUqgb9JyH2B1PfYREanVnvb3b46uvPLKBnuvONsIZgKDzWyAmeUA44CplfaZClxsgSOAre6+JsaYRPYqXk33QpE9UZ9/U7GVCNy9xMyuB6YRdB+d5O7zzOyacPuDwGsEXUcXE3QfvbSm9xNpaXJzc8sfBtIopNIQPJyPoKyLb1TW3L6RjBo1ysta2kWaM81QJnGoaYYyM5vt7qOqO0ad6kWaSHZ2dp1mkRKJS9qMPioiItVTIhARSXNKBCIiaa7ZNRab2XpgeT0P7wZsaMBwmgNdc3rQNaeHPbnm/dy92jEzml0i2BNmNqumVvOWStecHnTN6SGua1bVkIhImlMiEBFJc+mWCB5q6gCagK45Peia00Ms15xWbQQiIlJVupUIRESkEiUCEZE01yITgZmdamZfmNliM6sykWg47PU94fa5ZjayKeJsSBGu+Qfhtc41sw/M7OCmiLMhpbrmpP0OM7NSMzu/MeOLQ5RrNrMxZpZnZvPM7J3q9mlOIvzb7mhmL5vZnPCam/UoxmY2yczWmVl+Ddsb/v5V09RlzfWHYMjrL4GBQA4wBxhaaZ/TgX8QzJB2BPBRU8fdCNd8FNA5fH1aOlxz0n7/JBjy/PymjrsR/s6dgPlAv3C5R1PH3QjX/F/Ab8PX3YFNQE5Tx74H13wcMBLIr2F7g9+/WmKJYDSw2N2XuHsRMAU4u9I+ZwOPe+BDoJOZ9WrsQBtQymt29w/cfXO4+CHBbHDNWZS/M8ANwPPAusYMLiZRrvn7wN/d/SsAd2/u1x3lmh1ob8GkDu0IEkHqWe/3Uu4+g+AaatLg96+WmAh6AyuSlleG6+q6T3NS1+u5nOAbRXOW8prNrDdwLvBgI8YVpyh/5yFAZzObbmazzeziRosuHlGu+V7gWwTT3H4G/NDdE40TXpNo8PtXS5yPoLqpnir3kY2yT3MS+XrM7HiCRHBMrBHF
L8o1/xH4mbuXtpAZwKJccxZwKPBdoDXwLzP70N0Xxh1cTKJc8ylAHnACMAh408zedfdtMcfWVBr8/tUSE8FKoG/Sch+Cbwp13ac5iXQ9ZvZt4BHgNHff2EixxSXKNY8CpoRJoBtwupmVuPuLjRJhw4v6b3uDu+8EdprZDOBgoLkmgijXfClwpwcV6IvNbClwIPBx44TY6Br8/tUSq4ZmAoPNbICZ5QDjgKmV9pkKXBy2vh8BbHX3NY0daANKec1m1g/4OzC+GX87TJbymt19gLv3d/f+wHPAtc04CUC0f9svAceaWZaZtQEOBxY0cpwNKco1f0VQAsLMegIHAEsaNcrG1eD3rxZXInD3EjO7HphG0ONgkrvPM7Nrwu0PEvQgOR1YDOwi+EbRbEW85puBrsD94TfkEm/GIzdGvOYWJco1u/sCM3sdmAskgEfcvdpuiM1BxL/zbcCjZvYZQbXJz9y92Q5PbWaTgTFANzNbCfwSyIb47l8aYkJEJM21xKohERGpAyUCEZE0p0QgIpLmlAhERNKcEoGISJpTIkgD4cibeUk//WvZd0cDnO9RM1sanusTMzuyHu/xiJkNDV//V6VtH+xpjOH7lH0u+eHolZ1S7D/CzE6vx3l6mdkr4esxZrbVzD41swVm9st6vN9ZZaNwmtk5ZZ9TuHyrmZ1Y1/es5hyPWorRWsNhLCJ3QQ6v/ZUI+1U7+qaZ/d7MToh6PolOiSA97Hb3EUk/yxrhnDe6+whgIvCXuh7s7le4+/xw8b8qbTtqz8MDvvlchhMM8nVdiv1HEPTfrqufAA8nLb/r7ocQPPl8kZkdWpc3c/ep7n5nuHgOMDRp283u/lY9YtybPAqcWs36PxP8e5IGpkSQhsysnZn9b/ht/TMzqzJqZ/gtdkbSN+Zjw/Unm9m/wmOfNbN2KU43A9g/PPYn4Xvlm9mPwnVtzexVC8aSzzez74Xrp5vZKDO7E2gdxvFUuG1H+Ptvyd/Qw2+x55lZppndZWYzLRiv/eoIH8u/CAfuMrPRFszZ8Gn4+4DwqdZbge+FsXwvjH1SeJ5Pq/scQ+cBr1deGQ4DMRsYFJY2PgzjfcHMOoexTDCz+eH6KeG6S8zsXjM7CjgLuCuMaVDZN3kzO83Mnkn6bMaY2cvh6zr9Dc3s5vAa883sIbMKAzddFH5G+WY2Otw/6udSrZpG33T35UBXM9unLu8nETTWGNv6abofoJRgUK484AWCJ8o7hNu6ETyhWPZw4Y7w9/8FbgpfZwLtw31nAG3D9T8Dbq7mfI8Sjv0P/DvwEcFAaJ8BbQmGCp4HHEJwk3w46diO4e/pwKjkmJL2KYvxXOCx8HUOwYiMrYGrgJ+H61sBs4AB1cS5I+n6ngVODZc7AFnh6xOB58PXlwD3Jh3/G+Ci8HUngvF82lY6xwBgdtLyGOCV8HVXYBkwjOBJ4O+E628F/hi+Xg20KjtH5TiSP+vk5fBv/FXS3+oB4KJ6/g27JK1/Ajgz6W/0cPj6OMLx82v6XCpd+yiCp55r+jfbn2rG4ycoWZ3X1P+nWtpPixtiQqq124NqGgDMLBv4jZkdRzAMQW+gJ7A26ZiZwKRw3xfdPc/MvkNQDfF++KUwh+CbdHXuMrOfA+sJRjv9LvCCB9+CMbO/A8cSfFP+vZn9luAm8W4drusfwD1m1oqgKmGGu+82s5OBbyfVcXcEBgNLKx3f2szyCG46s4E3k/Z/zMwGE4zqmF3D+U8GzjKzn4bLuUA/Ko7t0yv8DJIda2afEnz2dxIMItbJ3ctmE3uMIDFBkCCeMrMXgRdriKMKD4ZmeB0408yeA84A/hOoy9+wzPFm9p9AG6ALQRJ/Odw2OTzfDDPrYEE7S02fS3J8s4Arol5PknXAvvU4TmqhRJCefkAwk9Oh7l5sZssI/rOWC/9jH0dwA3nCzO4CNgNvuvuFEc5xo7s/V7ZgNTRguvvCsI78dOAOM3vD3W+NchHuXmBm0wmGIf4e4U2JYLyZG9x9
Woq32O3uI8ysI/AKQRvBPQRj17zt7uda0LA+vYbjjeDb6Re1nYNKny1BG8HY8jcJzl+TMwi+bZ8F/MLMhtWyb2V/I7imTcBMd98eVutE/RtiZrnA/QSlsxVmdgsVr6fyGDVODZ+LBQPC7alcgs9UGpDaCNJTR2BdmASOB/arvIOZ7Rfu8zDwV4Kp8z4Ejjazsjr/NmY2JOI5ZwDnhMe0JajWedfM9gV2ufuTwO/D81RWHJZMqjOFYNCtYwkGJiP8/X/KjjGzIeE5q+XuW4EJwE/DYzoCq8LNlyTtup2giqzMNOCGsjpzMzukmrdfSFDiqFF4/s0WtsMA44F3zCwD6OvubxN8m+9EUK2WrHJMyaYTfJ5XEiQFqPvfsOymvyFsS6jck6isTecYglEwtxLtc6mvIUCzHURvb6VEkJ6eAkaZ2SyC0sHn1ewzBsgLqzDOA/7k7usJboyTzWwuwU3lwCgndPdPCOqdPyZoM3jE3T8FDgI+DqtobgJur+bwh4C5FjYWV/IGwTfmtzyYyhCCORfmA59Y0AXxL6Qo/YaxzCEY5vh3BKWT9wnaD8q8DQwtaywmKDlkh7Hlh8uV33cn8GXZjbcW/0FQnTaXoHfSreG5n7RgVM1PgbvdfUul46YAN4aNsoMqnbuUoKRzWvibuv4Nw/M9TNC+8yJBlWGyzRZ0532QoAoQInwuFnQEeKS6c1ow+ua/gAPMbKWZXR6uzyboeDCrpnilfjT6qEjMzOxcgmq4nzd1LM1Z+DmOdPdfNHUsLY3aCERi5u4vmFnXpo6jBcgC/rupg2iJVCIQEUlzaiMQEUlzSgQiImlOiUBEJM0pEYiIpDklAhGRNPf/AaHCUMohgRR1AAAAAElFTkSuQmCC",
1437
+ "text/plain": [
1438
+ "<Figure size 432x288 with 1 Axes>"
1439
+ ]
1440
+ },
1441
+ "metadata": {
1442
+ "needs_background": "light"
1443
+ },
1444
+ "output_type": "display_data"
1445
+ }
1446
+ ],
1447
+ "source": [
1448
+ "from sklearn.metrics import RocCurveDisplay\n",
1449
+ "RocCurveDisplay.from_predictions(excluded_pair_values, predictions)"
1450
+ ]
1451
+ },
1452
+ {
1453
+ "cell_type": "code",
1454
+ "execution_count": null,
1455
+ "metadata": {},
1456
+ "outputs": [],
1457
+ "source": []
1458
+ }
1459
+ ],
1460
+ "metadata": {
1461
+ "kernelspec": {
1462
+ "display_name": "base",
1463
+ "language": "python",
1464
+ "name": "python3"
1465
+ },
1466
+ "language_info": {
1467
+ "codemirror_mode": {
1468
+ "name": "ipython",
1469
+ "version": 3
1470
+ },
1471
+ "file_extension": ".py",
1472
+ "mimetype": "text/x-python",
1473
+ "name": "python",
1474
+ "nbconvert_exporter": "python",
1475
+ "pygments_lexer": "ipython3",
1476
+ "version": "3.8.8"
1477
+ },
1478
+ "orig_nbformat": 4
1479
+ },
1480
+ "nbformat": 4,
1481
+ "nbformat_minor": 2
1482
+ }
parse.ipynb ADDED
@@ -0,0 +1,698 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "attachments": {},
5
+ "cell_type": "markdown",
6
+ "metadata": {},
7
+ "source": [
8
+ "# Convert the DrugBank XML database to JSON and extract features"
9
+ ]
10
+ },
11
+ {
12
+ "cell_type": "markdown",
13
+ "metadata": {},
14
+ "source": [
15
+ "Run using Python 3 to avoid a non-ASCII character error when writing to a file with the csv module."
16
+ ]
17
+ },
18
+ {
19
+ "cell_type": "code",
20
+ "execution_count": 120,
21
+ "metadata": {
22
+ "collapsed": true
23
+ },
24
+ "outputs": [],
25
+ "source": [
26
+ "import os\n",
27
+ "import csv\n",
28
+ "import gzip\n",
29
+ "import collections\n",
30
+ "import re\n",
31
+ "import io\n",
32
+ "import json\n",
33
+ "import xml.etree.ElementTree as ET\n",
34
+ "import requests\n",
35
+ "import pandas\n",
36
+ "import xmltodict\n",
37
+ "import json"
38
+ ]
39
+ },
40
+ {
41
+ "cell_type": "code",
42
+ "execution_count": 3,
43
+ "metadata": {},
44
+ "outputs": [],
45
+ "source": [
46
+ "xml_path = \"data/full_database.xml\"\n",
47
+ "json_path = \"data/full_database.json\""
48
+ ]
49
+ },
50
+ {
51
+ "cell_type": "code",
52
+ "execution_count": 2,
53
+ "metadata": {},
54
+ "outputs": [],
55
+ "source": [
56
+ "# Read the XML file\n",
57
+ "\n",
58
+ "with open('data/full_database.xml', encoding=\"UTF8\") as f:\n",
59
+ " db = xmltodict.parse(f.read())\n",
60
+ "\n",
61
+ "json_obj = json.dumps(db, indent=4)\n",
62
+ "\n",
63
+ "# output as json\n",
64
+ "with open(\"data/full_database.json\", \"w\") as outfile:\n",
65
+ " outfile.write(json_obj)"
66
+ ]
67
+ },
68
+ {
69
+ "cell_type": "code",
70
+ "execution_count": 173,
71
+ "metadata": {},
72
+ "outputs": [],
73
+ "source": [
74
+ "desired_props_exp = set([\"Water Solubility\",\n",
75
+ " \"Melting Point\",\n",
76
+ " \"Boiling Point\",\n",
77
+ " \"logP\",\n",
78
+ " \"logS\",\n",
79
+ " \"Hydrophobicity\",\n",
80
+ " \"Isoelectric Point\",\n",
81
+ " \"caco2 Permeability\",\n",
82
+ " \"pKa\",\n",
83
+ " \"Molecular Weight\",\n",
84
+ " \"Radioactivity\"])\n",
85
+ "\n",
86
+ "desired_props_calc = set([\"logP\",\n",
87
+ " \"logS\",\n",
88
+ " \"Water Solubility\",\n",
89
+ " \"Molecular Weight\",\n",
90
+ " \"Monoisotopic Weight\",\n",
91
+ " \"Polar Surface Area (PSA)\",\n",
92
+ " \"Refractivity\",\n",
93
+ " \"Polarizability\",\n",
94
+ " \"Rotatable Bond Count\",\n",
95
+ " \"H Bond Acceptor Count\",\n",
96
+ " \"H Bond Donor Count\",\n",
97
+ " \"pKa (strongest acidic)\",\n",
98
+ " \"pKa (strongest basic)\",\n",
99
+ " \"Physiological Charge\",\n",
100
+ " \"Number of Rings\",\n",
101
+ " \"Bioavailability\",\n",
102
+ " \"Rule of Five\",\n",
103
+ " \"Ghose Filter\",\n",
104
+ " \"MDDR-Like Rule\",\n",
105
+ " \"Veber's Rule\"])\n",
106
+ "\n",
107
+ "def getProperties(desired_props, props, row):\n",
108
+ " for prop in desired_props:\n",
109
+ " if prop not in row:\n",
110
+ " row[prop] = None\n",
111
+ "\n",
112
+ " try:\n",
113
+ " for prop in props:\n",
114
+ " if(prop['kind'] in desired_props):\n",
115
+ " match = re.search(r\"[-+]?[0-9]*\\.?[0-9]+([eE][-+]?[0-9]+)?\", prop['value'])\n",
116
+ " row[prop['kind']] = float(match.group(0))\n",
117
+ " except:\n",
118
+ " pass"
119
+ ]
120
+ },
121
+ {
122
+ "cell_type": "code",
123
+ "execution_count": 12,
124
+ "metadata": {},
125
+ "outputs": [],
126
+ "source": [
127
+ "with open(json_path) as f:\n",
128
+ " data = json.load(f)"
129
+ ]
130
+ },
131
+ {
132
+ "cell_type": "code",
133
+ "execution_count": 174,
134
+ "metadata": {},
135
+ "outputs": [],
136
+ "source": [
137
+ "rows = []\n",
138
+ "for i in range(15235):\n",
139
+ " row = {}\n",
140
+ " drug = data['drugbank']['drug'][i]\n",
141
+ " row['name'] = drug['name']\n",
142
+ " row['state'] = drug.get('state', None)\n",
143
+ " atc_code = None\n",
144
+ " try:\n",
145
+ " atc_code = drug.get('atc-codes', dict()).get('atc-code', None)\n",
146
+ " atc_code = atc_code[0]\n",
147
+ " except:\n",
148
+ " pass\n",
149
+ "\n",
150
+ " row['level4'] = None\n",
151
+ " row['level3'] = None\n",
152
+ " row['level2'] = None\n",
153
+ " row['level1'] = None\n",
154
+ " try:\n",
155
+ " row['level4'] = atc_code['level'][0]['@code']\n",
156
+ " row['level3'] = atc_code['level'][1]['@code']\n",
157
+ " row['level2'] = atc_code['level'][2]['@code']\n",
158
+ " row['level1'] = atc_code['level'][3]['@code']\n",
159
+ " except:\n",
160
+ " pass\n",
161
+ "\n",
162
+ " \n",
163
+ " try:\n",
164
+ " exp_props = drug['experimental-properties']['property']\n",
165
+ " except:\n",
166
+ " exp_props = None\n",
167
+ " getProperties(desired_props_exp, exp_props, row)\n",
168
+ "\n",
169
+ " try:\n",
170
+ " calc_props = drug['calculated-properties']['property']\n",
171
+ " except:\n",
172
+ " calc_props = None\n",
173
+ " getProperties(desired_props_calc, calc_props, row)\n",
174
+ "\n",
175
+ " rows.append(row)"
176
+ ]
177
+ },
178
+ {
179
+ "cell_type": "code",
180
+ "execution_count": 175,
181
+ "metadata": {},
182
+ "outputs": [
183
+ {
184
+ "data": {
185
+ "text/html": [
186
+ "<div>\n",
187
+ "<style scoped>\n",
188
+ " .dataframe tbody tr th:only-of-type {\n",
189
+ " vertical-align: middle;\n",
190
+ " }\n",
191
+ "\n",
192
+ " .dataframe tbody tr th {\n",
193
+ " vertical-align: top;\n",
194
+ " }\n",
195
+ "\n",
196
+ " .dataframe thead th {\n",
197
+ " text-align: right;\n",
198
+ " }\n",
199
+ "</style>\n",
200
+ "<table border=\"1\" class=\"dataframe\">\n",
201
+ " <thead>\n",
202
+ " <tr style=\"text-align: right;\">\n",
203
+ " <th></th>\n",
204
+ " <th>name</th>\n",
205
+ " <th>state</th>\n",
206
+ " <th>level4</th>\n",
207
+ " <th>level3</th>\n",
208
+ " <th>level2</th>\n",
209
+ " <th>level1</th>\n",
210
+ " <th>Hydrophobicity</th>\n",
211
+ " <th>Boiling Point</th>\n",
212
+ " <th>Molecular Weight</th>\n",
213
+ " <th>Isoelectric Point</th>\n",
214
+ " <th>...</th>\n",
215
+ " <th>Polar Surface Area (PSA)</th>\n",
216
+ " <th>Veber's Rule</th>\n",
217
+ " <th>pKa (strongest basic)</th>\n",
218
+ " <th>Ghose Filter</th>\n",
219
+ " <th>Monoisotopic Weight</th>\n",
220
+ " <th>MDDR-Like Rule</th>\n",
221
+ " <th>Polarizability</th>\n",
222
+ " <th>H Bond Acceptor Count</th>\n",
223
+ " <th>Physiological Charge</th>\n",
224
+ " <th>Rule of Five</th>\n",
225
+ " </tr>\n",
226
+ " </thead>\n",
227
+ " <tbody>\n",
228
+ " <tr>\n",
229
+ " <th>0</th>\n",
230
+ " <td>Lepirudin</td>\n",
231
+ " <td>solid</td>\n",
232
+ " <td>B01AE</td>\n",
233
+ " <td>B01A</td>\n",
234
+ " <td>B01</td>\n",
235
+ " <td>B</td>\n",
236
+ " <td>NaN</td>\n",
237
+ " <td>NaN</td>\n",
238
+ " <td>NaN</td>\n",
239
+ " <td>NaN</td>\n",
240
+ " <td>...</td>\n",
241
+ " <td>NaN</td>\n",
242
+ " <td>None</td>\n",
243
+ " <td>NaN</td>\n",
244
+ " <td>NaN</td>\n",
245
+ " <td>NaN</td>\n",
246
+ " <td>NaN</td>\n",
247
+ " <td>NaN</td>\n",
248
+ " <td>NaN</td>\n",
249
+ " <td>NaN</td>\n",
250
+ " <td>NaN</td>\n",
251
+ " </tr>\n",
252
+ " <tr>\n",
253
+ " <th>1</th>\n",
254
+ " <td>Cetuximab</td>\n",
255
+ " <td>liquid</td>\n",
256
+ " <td>L01FE</td>\n",
257
+ " <td>L01F</td>\n",
258
+ " <td>L01</td>\n",
259
+ " <td>L</td>\n",
260
+ " <td>-0.413</td>\n",
261
+ " <td>NaN</td>\n",
262
+ " <td>145781.6000</td>\n",
263
+ " <td>8.48</td>\n",
264
+ " <td>...</td>\n",
265
+ " <td>NaN</td>\n",
266
+ " <td>None</td>\n",
267
+ " <td>NaN</td>\n",
268
+ " <td>NaN</td>\n",
269
+ " <td>NaN</td>\n",
270
+ " <td>NaN</td>\n",
271
+ " <td>NaN</td>\n",
272
+ " <td>NaN</td>\n",
273
+ " <td>NaN</td>\n",
274
+ " <td>NaN</td>\n",
275
+ " </tr>\n",
276
+ " <tr>\n",
277
+ " <th>2</th>\n",
278
+ " <td>Dornase alfa</td>\n",
279
+ " <td>liquid</td>\n",
280
+ " <td>R05CB</td>\n",
281
+ " <td>R05C</td>\n",
282
+ " <td>R05</td>\n",
283
+ " <td>R</td>\n",
284
+ " <td>-0.083</td>\n",
285
+ " <td>NaN</td>\n",
286
+ " <td>29253.9000</td>\n",
287
+ " <td>4.58</td>\n",
288
+ " <td>...</td>\n",
289
+ " <td>NaN</td>\n",
290
+ " <td>None</td>\n",
291
+ " <td>NaN</td>\n",
292
+ " <td>NaN</td>\n",
293
+ " <td>NaN</td>\n",
294
+ " <td>NaN</td>\n",
295
+ " <td>NaN</td>\n",
296
+ " <td>NaN</td>\n",
297
+ " <td>NaN</td>\n",
298
+ " <td>NaN</td>\n",
299
+ " </tr>\n",
300
+ " <tr>\n",
301
+ " <th>3</th>\n",
302
+ " <td>Denileukin diftitox</td>\n",
303
+ " <td>liquid</td>\n",
304
+ " <td>L01XX</td>\n",
305
+ " <td>L01X</td>\n",
306
+ " <td>L01</td>\n",
307
+ " <td>L</td>\n",
308
+ " <td>-0.301</td>\n",
309
+ " <td>NaN</td>\n",
310
+ " <td>57647.3000</td>\n",
311
+ " <td>5.45</td>\n",
312
+ " <td>...</td>\n",
313
+ " <td>NaN</td>\n",
314
+ " <td>None</td>\n",
315
+ " <td>NaN</td>\n",
316
+ " <td>NaN</td>\n",
317
+ " <td>NaN</td>\n",
318
+ " <td>NaN</td>\n",
319
+ " <td>NaN</td>\n",
320
+ " <td>NaN</td>\n",
321
+ " <td>NaN</td>\n",
322
+ " <td>NaN</td>\n",
323
+ " </tr>\n",
324
+ " <tr>\n",
325
+ " <th>4</th>\n",
326
+ " <td>Etanercept</td>\n",
327
+ " <td>liquid</td>\n",
328
+ " <td>L04AB</td>\n",
329
+ " <td>L04A</td>\n",
330
+ " <td>L04</td>\n",
331
+ " <td>L</td>\n",
332
+ " <td>-0.529</td>\n",
333
+ " <td>NaN</td>\n",
334
+ " <td>51234.9000</td>\n",
335
+ " <td>7.89</td>\n",
336
+ " <td>...</td>\n",
337
+ " <td>NaN</td>\n",
338
+ " <td>None</td>\n",
339
+ " <td>NaN</td>\n",
340
+ " <td>NaN</td>\n",
341
+ " <td>NaN</td>\n",
342
+ " <td>NaN</td>\n",
343
+ " <td>NaN</td>\n",
344
+ " <td>NaN</td>\n",
345
+ " <td>NaN</td>\n",
346
+ " <td>NaN</td>\n",
347
+ " </tr>\n",
348
+ " <tr>\n",
349
+ " <th>...</th>\n",
350
+ " <td>...</td>\n",
351
+ " <td>...</td>\n",
352
+ " <td>...</td>\n",
353
+ " <td>...</td>\n",
354
+ " <td>...</td>\n",
355
+ " <td>...</td>\n",
356
+ " <td>...</td>\n",
357
+ " <td>...</td>\n",
358
+ " <td>...</td>\n",
359
+ " <td>...</td>\n",
360
+ " <td>...</td>\n",
361
+ " <td>...</td>\n",
362
+ " <td>...</td>\n",
363
+ " <td>...</td>\n",
364
+ " <td>...</td>\n",
365
+ " <td>...</td>\n",
366
+ " <td>...</td>\n",
367
+ " <td>...</td>\n",
368
+ " <td>...</td>\n",
369
+ " <td>...</td>\n",
370
+ " <td>...</td>\n",
371
+ " </tr>\n",
372
+ " <tr>\n",
373
+ " <th>15230</th>\n",
374
+ " <td>AUM-601</td>\n",
375
+ " <td>None</td>\n",
376
+ " <td>None</td>\n",
377
+ " <td>None</td>\n",
378
+ " <td>None</td>\n",
379
+ " <td>None</td>\n",
380
+ " <td>NaN</td>\n",
381
+ " <td>NaN</td>\n",
382
+ " <td>NaN</td>\n",
383
+ " <td>NaN</td>\n",
384
+ " <td>...</td>\n",
385
+ " <td>NaN</td>\n",
386
+ " <td>None</td>\n",
387
+ " <td>NaN</td>\n",
388
+ " <td>NaN</td>\n",
389
+ " <td>NaN</td>\n",
390
+ " <td>NaN</td>\n",
391
+ " <td>NaN</td>\n",
392
+ " <td>NaN</td>\n",
393
+ " <td>NaN</td>\n",
394
+ " <td>NaN</td>\n",
395
+ " </tr>\n",
396
+ " <tr>\n",
397
+ " <th>15231</th>\n",
398
+ " <td>FN-1501</td>\n",
399
+ " <td>None</td>\n",
400
+ " <td>None</td>\n",
401
+ " <td>None</td>\n",
402
+ " <td>None</td>\n",
403
+ " <td>None</td>\n",
404
+ " <td>NaN</td>\n",
405
+ " <td>NaN</td>\n",
406
+ " <td>431.5040</td>\n",
407
+ " <td>NaN</td>\n",
408
+ " <td>...</td>\n",
409
+ " <td>NaN</td>\n",
410
+ " <td>None</td>\n",
411
+ " <td>NaN</td>\n",
412
+ " <td>NaN</td>\n",
413
+ " <td>431.218206</td>\n",
414
+ " <td>NaN</td>\n",
415
+ " <td>NaN</td>\n",
416
+ " <td>NaN</td>\n",
417
+ " <td>NaN</td>\n",
418
+ " <td>NaN</td>\n",
419
+ " </tr>\n",
420
+ " <tr>\n",
421
+ " <th>15232</th>\n",
422
+ " <td>Tinengotinib</td>\n",
423
+ " <td>None</td>\n",
424
+ " <td>None</td>\n",
425
+ " <td>None</td>\n",
426
+ " <td>None</td>\n",
427
+ " <td>None</td>\n",
428
+ " <td>NaN</td>\n",
429
+ " <td>NaN</td>\n",
430
+ " <td>394.8600</td>\n",
431
+ " <td>NaN</td>\n",
432
+ " <td>...</td>\n",
433
+ " <td>NaN</td>\n",
434
+ " <td>None</td>\n",
435
+ " <td>NaN</td>\n",
436
+ " <td>NaN</td>\n",
437
+ " <td>394.130887</td>\n",
438
+ " <td>NaN</td>\n",
439
+ " <td>NaN</td>\n",
440
+ " <td>NaN</td>\n",
441
+ " <td>NaN</td>\n",
442
+ " <td>NaN</td>\n",
443
+ " </tr>\n",
444
+ " <tr>\n",
445
+ " <th>15233</th>\n",
446
+ " <td>Lipotecan</td>\n",
447
+ " <td>None</td>\n",
448
+ " <td>None</td>\n",
449
+ " <td>None</td>\n",
450
+ " <td>None</td>\n",
451
+ " <td>None</td>\n",
452
+ " <td>NaN</td>\n",
453
+ " <td>NaN</td>\n",
454
+ " <td>850.7100</td>\n",
455
+ " <td>NaN</td>\n",
456
+ " <td>...</td>\n",
457
+ " <td>NaN</td>\n",
458
+ " <td>None</td>\n",
459
+ " <td>NaN</td>\n",
460
+ " <td>NaN</td>\n",
461
+ " <td>850.183062</td>\n",
462
+ " <td>NaN</td>\n",
463
+ " <td>NaN</td>\n",
464
+ " <td>NaN</td>\n",
465
+ " <td>NaN</td>\n",
466
+ " <td>NaN</td>\n",
467
+ " </tr>\n",
468
+ " <tr>\n",
469
+ " <th>15234</th>\n",
470
+ " <td>Xenon Xe-129</td>\n",
471
+ " <td>None</td>\n",
472
+ " <td>None</td>\n",
473
+ " <td>None</td>\n",
474
+ " <td>None</td>\n",
475
+ " <td>None</td>\n",
476
+ " <td>NaN</td>\n",
477
+ " <td>NaN</td>\n",
478
+ " <td>128.9048</td>\n",
479
+ " <td>NaN</td>\n",
480
+ " <td>...</td>\n",
481
+ " <td>NaN</td>\n",
482
+ " <td>None</td>\n",
483
+ " <td>NaN</td>\n",
484
+ " <td>NaN</td>\n",
485
+ " <td>128.904781</td>\n",
486
+ " <td>NaN</td>\n",
487
+ " <td>NaN</td>\n",
488
+ " <td>NaN</td>\n",
489
+ " <td>NaN</td>\n",
490
+ " <td>NaN</td>\n",
491
+ " </tr>\n",
492
+ " </tbody>\n",
493
+ "</table>\n",
494
+ "<p>15235 rows × 33 columns</p>\n",
495
+ "</div>"
496
+ ],
497
+ "text/plain": [
498
+ " name state level4 level3 level2 level1 \\\n",
499
+ "0 Lepirudin solid B01AE B01A B01 B \n",
500
+ "1 Cetuximab liquid L01FE L01F L01 L \n",
501
+ "2 Dornase alfa liquid R05CB R05C R05 R \n",
502
+ "3 Denileukin diftitox liquid L01XX L01X L01 L \n",
503
+ "4 Etanercept liquid L04AB L04A L04 L \n",
504
+ "... ... ... ... ... ... ... \n",
505
+ "15230 AUM-601 None None None None None \n",
506
+ "15231 FN-1501 None None None None None \n",
507
+ "15232 Tinengotinib None None None None None \n",
508
+ "15233 Lipotecan None None None None None \n",
509
+ "15234 Xenon Xe-129 None None None None None \n",
510
+ "\n",
511
+ " Hydrophobicity Boiling Point Molecular Weight Isoelectric Point \\\n",
512
+ "0 NaN NaN NaN NaN \n",
513
+ "1 -0.413 NaN 145781.6000 8.48 \n",
514
+ "2 -0.083 NaN 29253.9000 4.58 \n",
515
+ "3 -0.301 NaN 57647.3000 5.45 \n",
516
+ "4 -0.529 NaN 51234.9000 7.89 \n",
517
+ "... ... ... ... ... \n",
518
+ "15230 NaN NaN NaN NaN \n",
519
+ "15231 NaN NaN 431.5040 NaN \n",
520
+ "15232 NaN NaN 394.8600 NaN \n",
521
+ "15233 NaN NaN 850.7100 NaN \n",
522
+ "15234 NaN NaN 128.9048 NaN \n",
523
+ "\n",
524
+ " ... Polar Surface Area (PSA) Veber's Rule pKa (strongest basic) \\\n",
525
+ "0 ... NaN None NaN \n",
526
+ "1 ... NaN None NaN \n",
527
+ "2 ... NaN None NaN \n",
528
+ "3 ... NaN None NaN \n",
529
+ "4 ... NaN None NaN \n",
530
+ "... ... ... ... ... \n",
531
+ "15230 ... NaN None NaN \n",
532
+ "15231 ... NaN None NaN \n",
533
+ "15232 ... NaN None NaN \n",
534
+ "15233 ... NaN None NaN \n",
535
+ "15234 ... NaN None NaN \n",
536
+ "\n",
537
+ " Ghose Filter Monoisotopic Weight MDDR-Like Rule Polarizability \\\n",
538
+ "0 NaN NaN NaN NaN \n",
539
+ "1 NaN NaN NaN NaN \n",
540
+ "2 NaN NaN NaN NaN \n",
541
+ "3 NaN NaN NaN NaN \n",
542
+ "4 NaN NaN NaN NaN \n",
543
+ "... ... ... ... ... \n",
544
+ "15230 NaN NaN NaN NaN \n",
545
+ "15231 NaN 431.218206 NaN NaN \n",
546
+ "15232 NaN 394.130887 NaN NaN \n",
547
+ "15233 NaN 850.183062 NaN NaN \n",
548
+ "15234 NaN 128.904781 NaN NaN \n",
549
+ "\n",
550
+ " H Bond Acceptor Count Physiological Charge Rule of Five \n",
551
+ "0 NaN NaN NaN \n",
552
+ "1 NaN NaN NaN \n",
553
+ "2 NaN NaN NaN \n",
554
+ "3 NaN NaN NaN \n",
555
+ "4 NaN NaN NaN \n",
556
+ "... ... ... ... \n",
557
+ "15230 NaN NaN NaN \n",
558
+ "15231 NaN NaN NaN \n",
559
+ "15232 NaN NaN NaN \n",
560
+ "15233 NaN NaN NaN \n",
561
+ "15234 NaN NaN NaN \n",
562
+ "\n",
563
+ "[15235 rows x 33 columns]"
564
+ ]
565
+ },
566
+ "execution_count": 175,
567
+ "metadata": {},
568
+ "output_type": "execute_result"
569
+ }
570
+ ],
571
+ "source": [
572
+ "drugbank_df = pandas.DataFrame.from_dict(rows)\n",
573
+ "drugbank_df.to_csv(\"data/full_database.csv\")\n",
574
+ "drugbank_df"
575
+ ]
576
+ },
577
+ {
578
+ "cell_type": "code",
579
+ "execution_count": 181,
580
+ "metadata": {},
581
+ "outputs": [],
582
+ "source": [
583
+ "threshold = 10\n",
584
+ "df = drugbank_df.dropna(thresh=drugbank_df.shape[1] - threshold + 1)\n",
585
+ "df = df.dropna(axis=1, thresh=df.shape[0]-1000+1)"
586
+ ]
587
+ },
588
+ {
589
+ "cell_type": "code",
590
+ "execution_count": 182,
591
+ "metadata": {},
592
+ "outputs": [
593
+ {
594
+ "data": {
595
+ "text/plain": [
596
+ "name 0\n",
597
+ "state 549\n",
598
+ "level4 35\n",
599
+ "level3 35\n",
600
+ "level2 35\n",
601
+ "level1 35\n",
602
+ "Molecular Weight 0\n",
603
+ "logP 0\n",
604
+ "Water Solubility 5\n",
605
+ "logS 27\n",
606
+ "Bioavailability 0\n",
607
+ "pKa (strongest acidic) 394\n",
608
+ "Refractivity 0\n",
609
+ "Number of Rings 0\n",
610
+ "H Bond Donor Count 0\n",
611
+ "Rotatable Bond Count 0\n",
612
+ "Polar Surface Area (PSA) 0\n",
613
+ "pKa (strongest basic) 110\n",
614
+ "Ghose Filter 0\n",
615
+ "Monoisotopic Weight 0\n",
616
+ "MDDR-Like Rule 0\n",
617
+ "Polarizability 0\n",
618
+ "H Bond Acceptor Count 0\n",
619
+ "Physiological Charge 0\n",
620
+ "Rule of Five 0\n",
621
+ "dtype: int64"
622
+ ]
623
+ },
624
+ "execution_count": 182,
625
+ "metadata": {},
626
+ "output_type": "execute_result"
627
+ }
628
+ ],
629
+ "source": [
630
+ "df.isna().sum()"
631
+ ]
632
+ },
633
+ {
634
+ "cell_type": "code",
635
+ "execution_count": 183,
636
+ "metadata": {},
637
+ "outputs": [],
638
+ "source": [
639
+ "df.to_csv('data/filtered_dataset.csv')"
640
+ ]
641
+ },
642
+ {
643
+ "cell_type": "code",
644
+ "execution_count": 184,
645
+ "metadata": {},
646
+ "outputs": [],
647
+ "source": [
648
+ "interactions = {}\n",
649
+ "# get the set of drugs in the filtered df\n",
650
+ "drugs = set(df[\"name\"])\n",
651
+ "\n",
652
+ "for i in range(15235):\n",
653
+ " drug = data['drugbank']['drug'][i]\n",
654
+ " \n",
655
+ " if drug.get(\"name\", None) in drugs:\n",
656
+ " try:\n",
657
+ " interactions[drug['name']] = [x['name'] for x in drug['drug-interactions'][\"drug-interaction\"] if x['name'] in drugs]\n",
658
+ " except:\n",
659
+ " interactions[drug['name']] = []"
660
+ ]
661
+ },
662
+ {
663
+ "cell_type": "code",
664
+ "execution_count": 185,
665
+ "metadata": {},
666
+ "outputs": [],
667
+ "source": [
668
+ "json_obj = json.dumps(interactions, indent=4)\n",
669
+ "\n",
670
+ "# output as json\n",
671
+ "with open(\"data/interactions.json\", \"w\") as outfile:\n",
672
+ " outfile.write(json_obj)"
673
+ ]
674
+ }
675
+ ],
676
+ "metadata": {
677
+ "anaconda-cloud": {},
678
+ "kernelspec": {
679
+ "display_name": "Python [default]",
680
+ "language": "python",
681
+ "name": "python3"
682
+ },
683
+ "language_info": {
684
+ "codemirror_mode": {
685
+ "name": "ipython",
686
+ "version": 3
687
+ },
688
+ "file_extension": ".py",
689
+ "mimetype": "text/x-python",
690
+ "name": "python",
691
+ "nbconvert_exporter": "python",
692
+ "pygments_lexer": "ipython3",
693
+ "version": "3.8.8"
694
+ }
695
+ },
696
+ "nbformat": 4,
697
+ "nbformat_minor": 0
698
+ }