{ "cells": [ { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "import pandas as pd" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
namestatelevel4level3level2level1Molecular WeightlogPWater SolubilitylogS...Rotatable Bond CountPolar Surface Area (PSA)pKa (strongest basic)Ghose FilterMonoisotopic WeightMDDR-Like RulePolarizabilityH Bond Acceptor CountPhysiological ChargeRule of Five
5BivalirudinsolidB01AEB01AB01B2180.2853-14.000.04640-4.7...66.0901.5711.880.02178.9858131.0218.5437.0-4.00.0
6LeuprolidesolidL02AEL02AL02L1209.3983-2.400.03380-4.6...32.0429.0411.920.01208.6454621.0125.2416.01.00.0
13GoserelinsolidL02AEL02AL02L1269.4105-5.100.02830-4.6...33.0495.8910.910.01268.6414391.0130.7418.01.00.0
25Gramicidin DliquidR02ABR02AR02R1811.25305.960.00390-5.7...50.0519.89NaN0.01810.0334191.0194.7316.00.00.0
33DesmopressinsolidH01BAH01BH01H1069.2200-6.100.11000-4.0...19.0435.4111.770.01068.4269561.0104.7815.01.00.0
47CetrorelixsolidH01CCH01CH01H1431.0380-1.700.00694-5.3...38.0495.6711.790.01429.6698181.0148.9318.01.00.0
74DaptomycinsolidJ01XXJ01XJ01J1620.6930-9.400.01730-5.0...35.0702.029.590.01619.7103661.0158.9627.0-3.00.0
97AbarelixsolidL02BXL02BL02L1416.0900-0.460.00371-5.6...38.0424.9810.660.01414.6840721.0149.3116.01.00.0
105Pyridoxal phosphatesolidA11HAA11HA11A247.1419-2.105.70000-1.6...4.0116.954.110.0247.0245740.020.906.0-2.01.0
106CyanocobalaminsolidB03BAB03BB03B1355.3652-3.200.02020-4.8...27.0477.858.680.01354.5674051.0138.7918.03.00.0
\n", "

10 rows × 25 columns

\n", "
" ], "text/plain": [ " name state level4 level3 level2 level1 \\\n", "5 Bivalirudin solid B01AE B01A B01 B \n", "6 Leuprolide solid L02AE L02A L02 L \n", "13 Goserelin solid L02AE L02A L02 L \n", "25 Gramicidin D liquid R02AB R02A R02 R \n", "33 Desmopressin solid H01BA H01B H01 H \n", "47 Cetrorelix solid H01CC H01C H01 H \n", "74 Daptomycin solid J01XX J01X J01 J \n", "97 Abarelix solid L02BX L02B L02 L \n", "105 Pyridoxal phosphate solid A11HA A11H A11 A \n", "106 Cyanocobalamin solid B03BA B03B B03 B \n", "\n", " Molecular Weight logP Water Solubility logS ... \\\n", "5 2180.2853 -14.00 0.04640 -4.7 ... \n", "6 1209.3983 -2.40 0.03380 -4.6 ... \n", "13 1269.4105 -5.10 0.02830 -4.6 ... \n", "25 1811.2530 5.96 0.00390 -5.7 ... \n", "33 1069.2200 -6.10 0.11000 -4.0 ... \n", "47 1431.0380 -1.70 0.00694 -5.3 ... \n", "74 1620.6930 -9.40 0.01730 -5.0 ... \n", "97 1416.0900 -0.46 0.00371 -5.6 ... \n", "105 247.1419 -2.10 5.70000 -1.6 ... \n", "106 1355.3652 -3.20 0.02020 -4.8 ... \n", "\n", " Rotatable Bond Count Polar Surface Area (PSA) pKa (strongest basic) \\\n", "5 66.0 901.57 11.88 \n", "6 32.0 429.04 11.92 \n", "13 33.0 495.89 10.91 \n", "25 50.0 519.89 NaN \n", "33 19.0 435.41 11.77 \n", "47 38.0 495.67 11.79 \n", "74 35.0 702.02 9.59 \n", "97 38.0 424.98 10.66 \n", "105 4.0 116.95 4.11 \n", "106 27.0 477.85 8.68 \n", "\n", " Ghose Filter Monoisotopic Weight MDDR-Like Rule Polarizability \\\n", "5 0.0 2178.985813 1.0 218.54 \n", "6 0.0 1208.645462 1.0 125.24 \n", "13 0.0 1268.641439 1.0 130.74 \n", "25 0.0 1810.033419 1.0 194.73 \n", "33 0.0 1068.426956 1.0 104.78 \n", "47 0.0 1429.669818 1.0 148.93 \n", "74 0.0 1619.710366 1.0 158.96 \n", "97 0.0 1414.684072 1.0 149.31 \n", "105 0.0 247.024574 0.0 20.90 \n", "106 0.0 1354.567405 1.0 138.79 \n", "\n", " H Bond Acceptor Count Physiological Charge Rule of Five \n", "5 37.0 -4.0 0.0 \n", "6 16.0 1.0 0.0 \n", "13 18.0 1.0 0.0 \n", "25 16.0 0.0 0.0 \n", "33 15.0 1.0 0.0 \n", "47 18.0 1.0 0.0 \n", "74 27.0 -3.0 0.0 
\n", "97 16.0 1.0 0.0 \n", "105 6.0 -2.0 1.0 \n", "106 18.0 3.0 0.0 \n", "\n", "[10 rows x 25 columns]" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# load the CSV's first column as the row index (index_col=0) instead of as a data column\n", "df = pd.read_csv('datasets/filtered_dataset.csv', index_col=0)\n", "df.head(10)" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " 
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
name_xstate_xlevel4_xlevel3_xlevel2_xlevel1_xMolecular Weight_xlogP_xWater Solubility_xlogS_x...Rotatable Bond Count_yPolar Surface Area (PSA)_ypKa (strongest basic)_yGhose Filter_yMonoisotopic Weight_yMDDR-Like Rule_yPolarizability_yH Bond Acceptor Count_yPhysiological Charge_yRule of Five_y
0BivalirudinsolidB01AEB01AB01B2180.2853-14.00.0464-4.7...66.0901.5711.880.02178.9858131.0218.5437.0-4.00.0
1BivalirudinsolidB01AEB01AB01B2180.2853-14.00.0464-4.7...32.0429.0411.920.01208.6454621.0125.2416.01.00.0
2BivalirudinsolidB01AEB01AB01B2180.2853-14.00.0464-4.7...33.0495.8910.910.01268.6414391.0130.7418.01.00.0
3BivalirudinsolidB01AEB01AB01B2180.2853-14.00.0464-4.7...50.0519.89NaN0.01810.0334191.0194.7316.00.00.0
4BivalirudinsolidB01AEB01AB01B2180.2853-14.00.0464-4.7...19.0435.4111.770.01068.4269561.0104.7815.01.00.0
..................................................................
6916895Methionine C-11NaNV09IXV09IV09V148.2100-2.223.9000-0.8...7.0104.824.110.0452.1960741.049.556.00.01.0
6916896Methionine C-11NaNV09IXV09IV09V148.2100-2.223.9000-0.8...9.0108.746.270.0497.1654281.053.396.00.01.0
6916897Methionine C-11NaNV09IXV09IV09V148.2100-2.223.9000-0.8...3.099.769.801.0404.1096250.037.187.00.01.0
6916898Methionine C-11NaNV09IXV09IV09V148.2100-2.223.9000-0.8...6.0114.40-3.500.0508.0552061.045.397.0-1.00.0
6916899Methionine C-11NaNV09IXV09IV09V148.2100-2.223.9000-0.8...4.063.329.500.0148.0624840.015.543.00.01.0
\n", "

6916900 rows × 50 columns

\n", "
" ], "text/plain": [ " name_x state_x level4_x level3_x level2_x level1_x \\\n", "0 Bivalirudin solid B01AE B01A B01 B \n", "1 Bivalirudin solid B01AE B01A B01 B \n", "2 Bivalirudin solid B01AE B01A B01 B \n", "3 Bivalirudin solid B01AE B01A B01 B \n", "4 Bivalirudin solid B01AE B01A B01 B \n", "... ... ... ... ... ... ... \n", "6916895 Methionine C-11 NaN V09IX V09I V09 V \n", "6916896 Methionine C-11 NaN V09IX V09I V09 V \n", "6916897 Methionine C-11 NaN V09IX V09I V09 V \n", "6916898 Methionine C-11 NaN V09IX V09I V09 V \n", "6916899 Methionine C-11 NaN V09IX V09I V09 V \n", "\n", " Molecular Weight_x logP_x Water Solubility_x logS_x ... \\\n", "0 2180.2853 -14.0 0.0464 -4.7 ... \n", "1 2180.2853 -14.0 0.0464 -4.7 ... \n", "2 2180.2853 -14.0 0.0464 -4.7 ... \n", "3 2180.2853 -14.0 0.0464 -4.7 ... \n", "4 2180.2853 -14.0 0.0464 -4.7 ... \n", "... ... ... ... ... ... \n", "6916895 148.2100 -2.2 23.9000 -0.8 ... \n", "6916896 148.2100 -2.2 23.9000 -0.8 ... \n", "6916897 148.2100 -2.2 23.9000 -0.8 ... \n", "6916898 148.2100 -2.2 23.9000 -0.8 ... \n", "6916899 148.2100 -2.2 23.9000 -0.8 ... \n", "\n", " Rotatable Bond Count_y Polar Surface Area (PSA)_y \\\n", "0 66.0 901.57 \n", "1 32.0 429.04 \n", "2 33.0 495.89 \n", "3 50.0 519.89 \n", "4 19.0 435.41 \n", "... ... ... \n", "6916895 7.0 104.82 \n", "6916896 9.0 108.74 \n", "6916897 3.0 99.76 \n", "6916898 6.0 114.40 \n", "6916899 4.0 63.32 \n", "\n", " pKa (strongest basic)_y Ghose Filter_y Monoisotopic Weight_y \\\n", "0 11.88 0.0 2178.985813 \n", "1 11.92 0.0 1208.645462 \n", "2 10.91 0.0 1268.641439 \n", "3 NaN 0.0 1810.033419 \n", "4 11.77 0.0 1068.426956 \n", "... ... ... ... 
\n", "6916895 4.11 0.0 452.196074 \n", "6916896 6.27 0.0 497.165428 \n", "6916897 9.80 1.0 404.109625 \n", "6916898 -3.50 0.0 508.055206 \n", "6916899 9.50 0.0 148.062484 \n", "\n", " MDDR-Like Rule_y Polarizability_y H Bond Acceptor Count_y \\\n", "0 1.0 218.54 37.0 \n", "1 1.0 125.24 16.0 \n", "2 1.0 130.74 18.0 \n", "3 1.0 194.73 16.0 \n", "4 1.0 104.78 15.0 \n", "... ... ... ... \n", "6916895 1.0 49.55 6.0 \n", "6916896 1.0 53.39 6.0 \n", "6916897 0.0 37.18 7.0 \n", "6916898 1.0 45.39 7.0 \n", "6916899 0.0 15.54 3.0 \n", "\n", " Physiological Charge_y Rule of Five_y \n", "0 -4.0 0.0 \n", "1 1.0 0.0 \n", "2 1.0 0.0 \n", "3 0.0 0.0 \n", "4 1.0 0.0 \n", "... ... ... \n", "6916895 0.0 1.0 \n", "6916896 0.0 1.0 \n", "6916897 0.0 1.0 \n", "6916898 -1.0 0.0 \n", "6916899 0.0 1.0 \n", "\n", "[6916900 rows x 50 columns]" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [
"# Cartesian product of the drug table with itself: one row per ordered (x, y) pair,\n",
"# including each drug paired with itself (2630 * 2630 = 6916900 rows).\n",
"df1 = pd.read_csv('datasets/filtered_dataset.csv', index_col=0)\n",
"df2 = pd.read_csv('datasets/filtered_dataset.csv', index_col=0)\n",
"\n",
"# overlapping column names get pandas' default '_x' / '_y' suffixes\n",
"df3 = df1.merge(df2, how='cross')\n",
"df3" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [], "source": [ "df3.to_csv('datasets/drug_pairs.csv')" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "(6916900, 50) (2630, 25)\n" ] } ], "source": [ "print(df3.shape, df1.shape)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
"from itertools import combinations\n",
"\n",
"# every unordered pair of rows in df, identified by drug name\n",
"drug_pairs = list(combinations(df['name'], 2))\n",
"\n",
"# Empty frame that fixes the output layout: both drug names first,\n",
"# then the first drug's features ('_d1') followed by the second drug's ('_d2')\n",
"feature_cols = df.columns[1:]\n",
"col1 = [f'{col}_d1' for col in feature_cols]\n",
"col2 = [f'{col}_d2' for col in feature_cols]\n",
"\n",
"df_pairs = pd.DataFrame(columns=['drug1', 'drug2', *col1, *col2])\n",
"\n",
"# Iterate through the drug pairs and
populate the dataframe\n",
"# DataFrame.append was removed in pandas 2.0 and re-copied the whole frame for every pair,\n",
"# so the per-pair loop is replaced by two keyed merges against suffixed copies of df.\n",
"pairs = pd.DataFrame(drug_pairs, columns=['drug1', 'drug2'])\n",
"first = df.add_suffix('_d1').rename(columns={'name_d1': 'drug1'})\n",
"second = df.add_suffix('_d2').rename(columns={'name_d2': 'drug2'})\n",
"\n",
"# how='left' keeps the row order of drug_pairs; validate= raises MergeError if a drug\n",
"# name is not unique in df, instead of silently duplicating pair rows.\n",
"# Resulting column order: drug1, drug2, <features>_d1..., <features>_d2...\n",
"df_pairs = (\n",
"    pairs\n",
"    .merge(first, on='drug1', how='left', validate='many_to_one')\n",
"    .merge(second, on='drug2', how='left', validate='many_to_one')\n",
")\n",
"\n",
"# Print the resulting pairwise combinations and features dataframe\n",
"print(df_pairs)\n" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/home/bprimal/.local/lib/python3.8/site-packages/IPython/core/interactiveshell.py:3442: DtypeWarning: Columns (2) have mixed types.Specify dtype option on import or set low_memory=False.\n", " exec(code_obj, self.user_global_ns, self.user_ns)\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
name_xstate_xlevel4_xlevel3_xlevel2_xlevel1_xMolecular Weight_xlogP_xWater Solubility_xlogS_x...Rotatable Bond Count_yPolar Surface Area (PSA)_ypKa (strongest basic)_yGhose Filter_yMonoisotopic Weight_yMDDR-Like Rule_yPolarizability_yH Bond Acceptor Count_yPhysiological Charge_yRule of Five_y
0BivalirudinsolidB01AEB01AB01B2180.2853-14.00.0464-4.7...66.0901.5711.880.02178.9858131.0218.5437.0-4.00.0
1BivalirudinsolidB01AEB01AB01B2180.2853-14.00.0464-4.7...32.0429.0411.920.01208.6454621.0125.2416.01.00.0
2BivalirudinsolidB01AEB01AB01B2180.2853-14.00.0464-4.7...33.0495.8910.910.01268.6414391.0130.7418.01.00.0
3BivalirudinsolidB01AEB01AB01B2180.2853-14.00.0464-4.7...50.0519.89NaN0.01810.0334191.0194.7316.00.00.0
4BivalirudinsolidB01AEB01AB01B2180.2853-14.00.0464-4.7...19.0435.4111.770.01068.4269561.0104.7815.01.00.0
5BivalirudinsolidB01AEB01AB01B2180.2853-14.00.0464-4.7...38.0495.6711.790.01429.6698181.0148.9318.01.00.0
6BivalirudinsolidB01AEB01AB01B2180.2853-14.00.0464-4.7...35.0702.029.590.01619.7103661.0158.9627.0-3.00.0
7BivalirudinsolidB01AEB01AB01B2180.2853-14.00.0464-4.7...38.0424.9810.660.01414.6840721.0149.3116.01.00.0
8BivalirudinsolidB01AEB01AB01B2180.2853-14.00.0464-4.7...4.0116.954.110.0247.0245740.020.906.0-2.01.0
9BivalirudinsolidB01AEB01AB01B2180.2853-14.00.0464-4.7...27.0477.858.680.01354.5674051.0138.7918.03.00.0
\n", "

10 rows × 50 columns

\n", "
" ], "text/plain": [ " name_x state_x level4_x level3_x level2_x level1_x \\\n", "0 Bivalirudin solid B01AE B01A B01 B \n", "1 Bivalirudin solid B01AE B01A B01 B \n", "2 Bivalirudin solid B01AE B01A B01 B \n", "3 Bivalirudin solid B01AE B01A B01 B \n", "4 Bivalirudin solid B01AE B01A B01 B \n", "5 Bivalirudin solid B01AE B01A B01 B \n", "6 Bivalirudin solid B01AE B01A B01 B \n", "7 Bivalirudin solid B01AE B01A B01 B \n", "8 Bivalirudin solid B01AE B01A B01 B \n", "9 Bivalirudin solid B01AE B01A B01 B \n", "\n", " Molecular Weight_x logP_x Water Solubility_x logS_x ... \\\n", "0 2180.2853 -14.0 0.0464 -4.7 ... \n", "1 2180.2853 -14.0 0.0464 -4.7 ... \n", "2 2180.2853 -14.0 0.0464 -4.7 ... \n", "3 2180.2853 -14.0 0.0464 -4.7 ... \n", "4 2180.2853 -14.0 0.0464 -4.7 ... \n", "5 2180.2853 -14.0 0.0464 -4.7 ... \n", "6 2180.2853 -14.0 0.0464 -4.7 ... \n", "7 2180.2853 -14.0 0.0464 -4.7 ... \n", "8 2180.2853 -14.0 0.0464 -4.7 ... \n", "9 2180.2853 -14.0 0.0464 -4.7 ... \n", "\n", " Rotatable Bond Count_y Polar Surface Area (PSA)_y \\\n", "0 66.0 901.57 \n", "1 32.0 429.04 \n", "2 33.0 495.89 \n", "3 50.0 519.89 \n", "4 19.0 435.41 \n", "5 38.0 495.67 \n", "6 35.0 702.02 \n", "7 38.0 424.98 \n", "8 4.0 116.95 \n", "9 27.0 477.85 \n", "\n", " pKa (strongest basic)_y Ghose Filter_y Monoisotopic Weight_y \\\n", "0 11.88 0.0 2178.985813 \n", "1 11.92 0.0 1208.645462 \n", "2 10.91 0.0 1268.641439 \n", "3 NaN 0.0 1810.033419 \n", "4 11.77 0.0 1068.426956 \n", "5 11.79 0.0 1429.669818 \n", "6 9.59 0.0 1619.710366 \n", "7 10.66 0.0 1414.684072 \n", "8 4.11 0.0 247.024574 \n", "9 8.68 0.0 1354.567405 \n", "\n", " MDDR-Like Rule_y Polarizability_y H Bond Acceptor Count_y \\\n", "0 1.0 218.54 37.0 \n", "1 1.0 125.24 16.0 \n", "2 1.0 130.74 18.0 \n", "3 1.0 194.73 16.0 \n", "4 1.0 104.78 15.0 \n", "5 1.0 148.93 18.0 \n", "6 1.0 158.96 27.0 \n", "7 1.0 149.31 16.0 \n", "8 0.0 20.90 6.0 \n", "9 1.0 138.79 18.0 \n", "\n", " Physiological Charge_y Rule of Five_y \n", "0 -4.0 0.0 \n", "1 
1.0 0.0 \n", "2 1.0 0.0 \n", "3 0.0 0.0 \n", "4 1.0 0.0 \n", "5 1.0 0.0 \n", "6 -3.0 0.0 \n", "7 1.0 0.0 \n", "8 -2.0 1.0 \n", "9 3.0 0.0 \n", "\n", "[10 rows x 50 columns]" ] }, "execution_count": 1, "metadata": {}, "output_type": "execute_result" } ], "source": [
"import pandas as pd\n",
"\n",
"catboost_df = pd.read_csv('datasets/drug_pairs.csv', index_col=0)\n",
"catboost_df.head(10)" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [
"import json\n",
"\n",
"with open('interactions.json', 'r') as f:\n",
"    interactions = json.load(f)\n" ] }, { "cell_type": "code", "execution_count": 37, "metadata": {}, "outputs": [], "source": [
"# Create a new column in the dataframe to store the interaction label\n",
"# For each drug pair, check if the interaction is present in the interactions dictionary\n",
"# If yes, assign 1, else 0\n",
"#\n",
"# The partner collections are turned into sets once (O(1) membership) and the two name\n",
"# columns are walked with zip: a row-wise DataFrame.apply(axis=1) builds a Series for\n",
"# each of the ~6.9M rows and scans a Python list per row.\n",
"partners_by_drug = {drug: set(partners) for drug, partners in interactions.items()}\n",
"no_partners = frozenset()  # shared default for drugs missing from interactions\n",
"\n",
"catboost_df['interaction'] = pd.Series(\n",
"    [\n",
"        int(drug_y in partners_by_drug.get(drug_x, no_partners))\n",
"        for drug_x, drug_y in zip(catboost_df['name_x'], catboost_df['name_y'])\n",
"    ],\n",
"    index=catboost_df.index,\n",
"    dtype='int64',\n",
")" ] }, { "cell_type": "code", "execution_count": 27, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
drug1drug2interaction
0hey1hello11
1hey2hello21
2hey3hello30
3hey4hello40
\n", "
" ], "text/plain": [ " drug1 drug2 interaction\n", "0 hey1 hello1 1\n", "1 hey2 hello2 1\n", "2 hey3 hello3 0\n", "3 hey4 hello4 0" ] }, "execution_count": 27, "metadata": {}, "output_type": "execute_result" } ], "source": [ "'''\n", "Dummy example to show how to use the interactions dictionary\n", "'''\n", "\n", "dummy = pd.DataFrame({'drug1': ['hey1', 'hey2', 'hey3', 'hey4'], 'drug2': ['hello1', 'hello2', 'hello3', 'hello4']}, columns=['drug1', 'drug2'])\n", "i = {\n", " 'hey1': ['hello1'],\n", " 'hey2': ['hello1', 'hello2'],\n", " 'hey3': ['hello4'],\n", " 'hey4': [],\n", " 'hey5': ['hello1', 'hello2', 'hello3', 'hello4']\n", "}\n", "dummy['interaction'] = dummy.apply(lambda x: 1 if x['drug2'] in i.get(x['drug1'], list()) else 0, axis=1)\n", "dummy.head()" ] }, { "cell_type": "code", "execution_count": 40, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "" ] }, "execution_count": 40, "metadata": {}, "output_type": "execute_result" }, { "data": { "image/png": "iVBORw0KGgoAAAANSUhEUgAAAhYAAAGsCAYAAACB/u5dAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjQuMywgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/MnkTPAAAACXBIWXMAAA9hAAAPYQGoP6dpAAAcw0lEQVR4nO3de5CVdf3A8c+yLAfRXRU3FHLxQoMWgjfS8PITTSJ1SGsqJ4rIvFSiUzJdNDOWvJGjjo2ZmZrkTEjZhJUiQhYymk5KMOElFfGaoqK5C2wdDrvP749mN3EBOev3POtZX68Z/tiHZ5/z4cPieXvO2T01WZZlAQCQQL/eHgAA6DuEBQCQjLAAAJIRFgBAMsICAEhGWAAAyQgLACAZYQEAJCMsAIBkhAUAkEyvhcWSJUti0qRJMWzYsKipqYnbbrut7GtkWRaXX355jBw5MgqFQrz//e+Piy++OP2wAMA26d9bN7x+/frYf//948tf/nJ86lOf6tE1vv71r8fChQvj8ssvj9GjR8frr78er7/+euJJAYBtVfNueBOympqamDdvXpx00kldx4rFYpx//vlxyy23xBtvvBH77bdf/PCHP4zx48dHRMRjjz0WY8aMiYcffjj22Wef3hkcANjEu/Y1FmeddVbcf//9MXfu3Pj73/8en/nMZ+LjH/94PPnkkxER8Yc//CH23nvvuP3222OvvfaKPffcM0477TSPWABAL3pXhsVzzz0XN910U9x6661x5JFHxogRI+Kb3/xmHHHEEXHTTTdFRMSqVavi2WefjVtvvTVuvvnmmD17dixdujQ+/elP9/L0APDe1WuvsdiaFStWRHt7e4wcOXKT48ViMXbZZZeIiOjo6IhisRg333xz13k33nhjHHzwwfH44497egQAesG7MizWrVsXtbW1sXTp0qitrd3k93bYYYeIiBg6dGj0799/k/j44Ac/GBH/fcRDWABA/t6VYXHggQdGe3t7vPLKK3HkkUdu9pzDDz88Nm7cGE
899VSMGDEiIiKeeOKJiIjYY489cpsVAPifXvuukHXr1sXKlSsj4r8hceWVV8bRRx8dgwcPjuHDh8cXvvCFuO++++KKK66IAw88MF599dW4++67Y8yYMXHCCSdER0dHfPjDH44ddtghrrrqqujo6Ihp06ZFQ0NDLFy4sDf+SADwntdrYbF48eI4+uijux2fOnVqzJ49O0qlUlx00UVx8803xz//+c9obGyMj3zkIzFz5swYPXp0RES8+OKLcfbZZ8fChQtj++23j+OOOy6uuOKKGDx4cN5/HAAg3iU/xwIA6Bveld9uCgBUJ2EBACST+3eFdHR0xIsvvhj19fVRU1OT980DAD2QZVmsXbs2hg0bFv36bflxidzD4sUXX4ympqa8bxYASOD555+P3XfffYu/n3tY1NfXR8R/B2toaEh23VKpFAsXLoyPfexjUVdXl+y6bMqe82PX+bDnfNhzPiq559bW1mhqauq6H9+S3MOi8+mPhoaG5GExaNCgaGho8EVbQfacH7vOhz3nw57zkcee3+5lDF68CQAkIywAgGSEBQCQjLAAAJIRFgBAMsICAEhGWAAAyQgLACAZYQEAJCMsAIBkhAUAkIywAACSERYAQDLCAgBIJve3Ta+0/ZrvimL71t/S9d3kmVkn9PYIAJCMRywAgGSEBQCQjLAAAJIRFgBAMsICAEhGWAAAyQgLACAZYQEAJCMsAIBkhAUAkIywAACSERYAQDLCAgBIRlgAAMkICwAgGWEBACQjLACAZIQFAJCMsAAAkhEWAEAywgIASEZYAADJCAsAIBlhAQAkIywAgGSEBQCQjLAAAJIRFgBAMsICAEhGWAAAyQgLACCZssKiubk5ampqNvm17777Vmo2AKDK9C/3E0aNGhV//OMf/3eB/mVfAgDoo8qugv79+8duu+1WiVkAgCpXdlg8+eSTMWzYsBg4cGCMGzcuLr300hg+fPgWzy8Wi1EsFrs+bm1tjYiIUqkUpVKpByNvXue1Cv2yZNfMQ8od5KFz3mqbuxrZdT7sOR/2nI9K7nlbr1mTZdk23xPfeeedsW7duthnn33ipZdeipkzZ8Y///nPePjhh6O+vn6zn9Pc3BwzZ87sdnzOnDkxaNCgbb1pAKAXtbW1xeTJk6OlpSUaGhq2eF5ZYfFWb7zxRuyxxx5x5ZVXxqmnnrrZczb3iEVTU1OsWbNmq4OVq1QqxaJFi+KCh/pFsaMm2XUr7eHmib09Qlk69zxhwoSoq6vr7XH6NLvOhz3nw57zUck9t7a2RmNj49uGxTt65eVOO+0UI0eOjJUrV27xnEKhEIVCodvxurq6inxxFTtqothePWFRrf/AKvX3R3d2nQ97zoc956MSe97W672jn2Oxbt26eOqpp2Lo0KHv5DIAQB9RVlh885vfjHvuuSeeeeaZ+Mtf/hKf/OQno7a2Nj73uc9Vaj4AoIqU9VTICy+8EJ/73Ofitddei/e9731xxBFHxAMPPBDve9/7KjUfAFBFygqLuXPnVmoOAKAP8F4hAEAywgIASEZYAADJCAsAIBlhAQAkIywAgGSEBQCQjLAAAJIRFgBAMsICAEhGWAAAyQgLACAZYQEAJCMsAIBkhAUAkIywAACSERYAQDLCAgBIRlgAAMkICwAgGWEBACQjLACAZIQFAJCMsAAAkhEWAEAywgIASEZYAADJCAsAIBlhAQAkIywAgGSEBQCQjLAAAJIRFgBAMsICAEhGWAAAyQgLACAZYQEAJCMsAIBkhAUAkIywAACSERYAQDLCAgBIRlgAAMkICwAgGWEBACQjLACAZIQFAJCMsAAAkhEWAEAywgIASEZYAADJCAsAIJl3FBazZs2Kmpqa+MY3vpFoHACgmvU4LB588MG47rrrYsyYMSnnAQCqWI/CYt26dfH5z38+rr/++th5551TzwQAVKn+PfmkadOmxQknnBDHHntsXHTRRVs9t1gsRrFY7Pq4tbU1IiJKpVKUSqWe3PxmdV6r0C
9Lds08pNxBHjrnrba5q5Fd58Oe82HP+ajknrf1mjVZlpV1Tzx37ty4+OKL48EHH4yBAwfG+PHj44ADDoirrrpqs+c3NzfHzJkzux2fM2dODBo0qJybBgB6SVtbW0yePDlaWlqioaFhi+eVFRbPP/98jB07NhYtWtT12oq3C4vNPWLR1NQUa9as2epg5SqVSrFo0aK44KF+UeyoSXbdSnu4eWJvj1CWzj1PmDAh6urqenucPs2u82HP+bDnfFRyz62trdHY2Pi2YVHWUyFLly6NV155JQ466KCuY+3t7bFkyZL48Y9/HMViMWprazf5nEKhEIVCodu16urqKvLFVeyoiWJ79YRFtf4Dq9TfH93ZdT7sOR/2nI9K7Hlbr1dWWHz0ox+NFStWbHLslFNOiX333Te+853vdIsKAOC9paywqK+vj/3222+TY9tvv33ssssu3Y4DAO89fvImAJBMj77d9M0WL16cYAwAoC/wiAUAkIywAACSERYAQDLCAgBIRlgAAMkICwAgGWEBACQjLACAZIQFAJCMsAAAkhEWAEAywgIASEZYAADJCAsAIBlhAQAkIywAgGSEBQCQjLAAAJIRFgBAMsICAEhGWAAAyQgLACAZYQEAJCMsAIBkhAUAkIywAACSERYAQDLCAgBIRlgAAMkICwAgGWEBACQjLACAZIQFAJCMsAAAkhEWAEAywgIASEZYAADJCAsAIBlhAQAkIywAgGSEBQCQjLAAAJIRFgBAMsICAEhGWAAAyQgLACAZYQEAJCMsAIBkhAUAkIywAACSERYAQDLCAgBIRlgAAMmUFRbXXnttjBkzJhoaGqKhoSHGjRsXd955Z6VmAwCqTFlhsfvuu8esWbNi6dKl8dBDD8UxxxwTJ554YjzyyCOVmg8AqCL9yzl50qRJm3x88cUXx7XXXhsPPPBAjBo1KulgAED1KSss3qy9vT1uvfXWWL9+fYwbN26L5xWLxSgWi10ft7a2RkREqVSKUqnU05vvpvNahX5ZsmvmIeUO8tA5b7XNXY3sOh/2nA97zkcl97yt16zJsqyse+IVK1bEuHHj4j//+U/ssMMOMWfOnDj++OO3eH5zc3PMnDmz2/E5c+bEoEGDyrlpAKCXtLW1xeTJk6OlpSUaGhq2eF7ZYbFhw4Z47rnnoqWlJX7zm9/EDTfcEPfcc0986EMf2uz5m3vEoqmpKdasWbPVwcpVKpVi0aJFccFD/aLYUZPsupX2cPPE3h6hLJ17njBhQtTV1fX2OH2aXefDnvNhz/mo5J5bW1ujsbHxbcOi7KdCBgwYEB/4wAciIuLggw+OBx98MH70ox/Fddddt9nzC4VCFAqFbsfr6uoq8sVV7KiJYnv1hEW1/gOr1N8f3dl1Puw5H/acj0rseVuv945/jkVHR8cmj0gAAO9dZT1icd5558Vxxx0Xw4cPj7Vr18acOXNi8eLFcdddd1VqPgCgipQVFq+88kp88YtfjJdeeil23HHHGDNmTNx1110xYcKESs0HAFSRssLixhtvrNQcAEAf4L1CAIBkhAUAkIywAACSERYAQDLCAgBIRlgAAMkICwAgGWEBACQjLACAZIQFAJCMsAAAkhEWAEAywgIASEZYAADJCAsAIBlhAQAkIywAgGSEBQCQjLAAAJIRFgBAMsICAEhGWAAAyQgLACAZYQEAJCMsAIBkhAUAkIywAACSERYAQDLCAgBIRlgAAMkICwAgGWEBACQjLACAZIQFAJCMsAAAkhEWAEAywgIASEZYAADJCAsAIBlhAQAkIywAgGSEBQCQjLAAAJIRFgBAMsICAEhGWAAAyQgLACAZYQEAJCMsAIBkhAUAkIywAACSERYAQDJlhcWll14aH/7wh6O+vj6GDBkSJ510Ujz++OOVmg0AqDJlhcU999wT06ZNiwceeCAWLVoUpVIpPvaxj8X69esrNR8AUEX6l3PyggULNvl49uzZMWTIkFi6dGn83//9X9LBAIDqU1ZYvFVLS0
tERAwePHiL5xSLxSgWi10ft7a2RkREqVSKUqn0Tm5+E53XKvTLkl0zDyl3kIfOeatt7mpk1/mw53zYcz4quedtvWZNlmU9uifu6OiIT3ziE/HGG2/Evffeu8XzmpubY+bMmd2Oz5kzJwYNGtSTmwYActbW1haTJ0+OlpaWaGho2OJ5PQ6Lr33ta3HnnXfGvffeG7vvvvsWz9vcIxZNTU2xZs2arQ5WrlKpFIsWLYoLHuoXxY6aZNettIebJ/b2CGXp3POECROirq6ut8fp0+w6H/acD3vORyX33NraGo2NjW8bFj16KuSss86K22+/PZYsWbLVqIiIKBQKUSgUuh2vq6uryBdXsaMmiu3VExbV+g+sUn9/dGfX+bDnfNhzPiqx5229XllhkWVZnH322TFv3rxYvHhx7LXXXj0aDgDom8oKi2nTpsWcOXPid7/7XdTX18fq1asjImLHHXeM7bbbriIDAgDVo6yfY3HttddGS0tLjB8/PoYOHdr161e/+lWl5gMAqkjZT4UAAGyJ9woBAJIRFgBAMsICAEhGWAAAyQgLACAZYQEAJCMsAIBkhAUAkIywAACSERYAQDLCAgBIRlgAAMkICwAgGWEBACQjLACAZIQFAJCMsAAAkhEWAEAywgIASEZYAADJCAsAIBlhAQAkIywAgGSEBQCQjLAAAJIRFgBAMsICAEhGWAAAyQgLACAZYQEAJCMsAIBkhAUAkIywAACSERYAQDL9e3sAAHi32vPcO3p7hLIUarO47JDencEjFgBAMsICAEhGWAAAyQgLACAZYQEAJCMsAIBkhAUAkIywAACSERYAQDLCAgBIRlgAAMkICwAgGWEBACQjLACAZIQFAJCMsAAAkhEWAEAyZYfFkiVLYtKkSTFs2LCoqamJ2267rQJjAQDVqOywWL9+fey///5xzTXXVGIeAKCK9S/3E4477rg47rjjKjELAFDlyg6LchWLxSgWi10ft7a2RkREqVSKUqmU7HY6r1XolyW7Zh5S7iAPnfNW29zVyK7zYc/5qNY9F2qr6z6l8z6wEnve1mvWZFnW463V1NTEvHnz4qSTTtriOc3NzTFz5sxux+fMmRODBg3q6U0DADlqa2uLyZMnR0tLSzQ0NGzxvIqHxeYesWhqaoo1a9ZsdbBylUqlWLRoUVzwUL8odtQku26lPdw8sbdHKEvnnidMmBB1dXW9PU6fZtf5sOd8VOue92u+q7dHKEuhXxYXju2oyJ5bW1ujsbHxbcOi4k+FFAqFKBQK3Y7X1dVV5Iur2FETxfbqCYtq+gf2ZpX6+6M7u86HPeej2vZcTfcnb1aJPW/r9fwcCwAgmbIfsVi3bl2sXLmy6+Onn346li9fHoMHD47hw4cnHQ4AqC5lh8VDDz0URx99dNfH06dPj4iIqVOnxuzZs5MNBgBUn7LDYvz48fEOXu8JAPRhXmMBACQjLACAZIQFAJCMsAAAkhEWAEAywgIASEZYAADJCAsAIBlhAQAkIywAgGSEBQCQjLAAAJIRFgBAMsICAEhGWAAAyQgLACAZYQEAJCMsAIBkhAUAkIywAACSERYAQDLCAgBIRlgAAMkICwAgGWEBACQjLACAZIQFAJCMsAAAkhEWAEAywgIASEZYAADJCAsAIBlhAQAkIywAgGSEBQCQjLAAAJIRFgBAMsICAEhGWAAAyQgLACAZYQEAJCMsAIBkhAUAkIywAACSERYAQDLCAgBIRlgAAMkICwAgGWEBACQjLACAZIQFAJCMsAAAkulRWFxzzTWx5557xsCBA+PQQw+Nv/71r6nnAgCqUNlh8atf/SqmT58eM2bMiL/97W+x//77x8SJE+OVV16pxHwAQBUpOyyuvPLKOP300+OUU06JD33oQ/HTn/40Bg0aFD//+c8rMR8AUEX6l3Pyhg0bYunSpXHeeed1HevXr18ce+yxcf/992/2c4rFYhSLxa6PW1paIiLi9ddfj1Kp1JOZN6tUKkVbW1v0L/WL9o
6aZNettNdee623RyhL555fe+21qKur6+1x+jS7zoc956Na99x/4/reHqEs/TuyaGvrqMie165dGxERWZZtfYZyLrpmzZpob2+PXXfddZPju+66a/zjH//Y7OdceumlMXPmzG7H99prr3Juus9qvKK3JwCgL5lc4euvXbs2dtxxxy3+fllh0RPnnXdeTJ8+vevjjo6OeP3112OXXXaJmpp0jyy0trZGU1NTPP/889HQ0JDsumzKnvNj1/mw53zYcz4quecsy2Lt2rUxbNiwrZ5XVlg0NjZGbW1tvPzyy5scf/nll2O33Xbb7OcUCoUoFAqbHNtpp53KudmyNDQ0+KLNgT3nx67zYc/5sOd8VGrPW3ukolNZL94cMGBAHHzwwXH33Xd3Hevo6Ii77747xo0bV/6EAECfUvZTIdOnT4+pU6fG2LFj45BDDomrrroq1q9fH6ecckol5gMAqkjZYXHyySfHq6++Gt///vdj9erVccABB8SCBQu6vaAzb4VCIWbMmNHtaRfSsuf82HU+7Dkf9pyPd8Oea7K3+74RAIBt5L1CAIBkhAUAkIywAACSERYAQDJVFRblvl37rbfeGvvuu28MHDgwRo8eHfPnz89p0upWzp6vv/76OPLII2PnnXeOnXfeOY499ti3/Xvhv8r9eu40d+7cqKmpiZNOOqmyA/Yh5e76jTfeiGnTpsXQoUOjUCjEyJEj/fdjG5S756uuuir22Wef2G677aKpqSnOOeec+M9//pPTtNVpyZIlMWnSpBg2bFjU1NTEbbfd9rafs3jx4jjooIOiUCjEBz7wgZg9e3Zlh8yqxNy5c7MBAwZkP//5z7NHHnkkO/3007Oddtope/nllzd7/n333ZfV1tZml112Wfboo49m3/ve97K6urpsxYoVOU9eXcrd8+TJk7NrrrkmW7ZsWfbYY49lX/rSl7Idd9wxe+GFF3KevLqUu+dOTz/9dPb+978/O/LII7MTTzwxn2GrXLm7LhaL2dixY7Pjjz8+u/fee7Onn346W7x4cbZ8+fKcJ68u5e75l7/8ZVYoFLJf/vKX2dNPP53ddddd2dChQ7Nzzjkn58mry/z587Pzzz8/++1vf5tFRDZv3rytnr9q1aps0KBB2fTp07NHH300u/rqq7Pa2tpswYIFFZuxasLikEMOyaZNm9b1cXt7ezZs2LDs0ksv3ez5n/3sZ7MTTjhhk2OHHnpo9pWvfKWic1a7cvf8Vhs3bszq6+uzX/ziF5UasU/oyZ43btyYHXbYYdkNN9yQTZ06VVhso3J3fe2112Z77713tmHDhrxG7BPK3fO0adOyY445ZpNj06dPzw4//PCKztmXbEtYfPvb385GjRq1ybGTTz45mzhxYsXmqoqnQjrfrv3YY4/tOvZ2b9d+//33b3J+RMTEiRO3eD492/NbtbW1RalUisGDB1dqzKrX0z3/4Ac/iCFDhsSpp56ax5h9Qk92/fvf/z7GjRsX06ZNi1133TX222+/uOSSS6K9vT2vsatOT/Z82GGHxdKlS7ueLlm1alXMnz8/jj/++Fxmfq/ojfvCir+7aQo9ebv21atXb/b81atXV2zOateTPb/Vd77znRg2bFi3L2T+pyd7vvfee+PGG2+M5cuX5zBh39GTXa9atSr+9Kc/xec///mYP39+rFy5Ms4888wolUoxY8aMPMauOj3Z8+TJk2PNmjVxxBFHRJZlsXHjxvjqV78a3/3ud/MY+T1jS/eFra2t8e9//zu222675LdZFY9YUB1mzZoVc+fOjXnz5sXAgQN7e5w+Y+3atTFlypS4/vrro7GxsbfH6fM6OjpiyJAh8bOf/SwOPvjgOPnkk+P888+Pn/70p709Wp+yePHiuOSSS+InP/lJ/O1vf4vf/va3cccdd8SFF17Y26PxDlXFIxY9ebv23Xbbrazz6dmeO11++eUxa9as+OMf/xhjxoyp5JhVr9w9P/XUU/HMM8/EpEmTuo51dHRERET//v3j8c
cfjxEjRlR26CrVk6/poUOHRl1dXdTW1nYd++AHPxirV6+ODRs2xIABAyo6czXqyZ4vuOCCmDJlSpx22mkRETF69OhYv359nHHGGXH++edHv37+vzeFLd0XNjQ0VOTRiogqecSiJ2/XPm7cuE3Oj4hYtGiRt3ffip7sOSLisssuiwsvvDAWLFgQY8eOzWPUqlbunvfdd99YsWJFLF++vOvXJz7xiTj66KNj+fLl0dTUlOf4VaUnX9OHH354rFy5siveIiKeeOKJGDp0qKjYgp7sua2trVs8dMZc5i2skumV+8KKvSw0sblz52aFQiGbPXt29uijj2ZnnHFGttNOO2WrV6/OsizLpkyZkp177rld5993331Z//79s8svvzx77LHHshkzZvh2021Q7p5nzZqVDRgwIPvNb36TvfTSS12/1q5d21t/hKpQ7p7fyneFbLtyd/3cc89l9fX12VlnnZU9/vjj2e23354NGTIku+iii3rrj1AVyt3zjBkzsvr6+uyWW27JVq1alS1cuDAbMWJE9tnPfra3/ghVYe3atdmyZcuyZcuWZRGRXXnlldmyZcuyZ599NsuyLDv33HOzKVOmdJ3f+e2m3/rWt7LHHnssu+aaa3y76ZtdffXV2fDhw7MBAwZkhxxySPbAAw90/d5RRx2VTZ06dZPzf/3rX2cjR47MBgwYkI0aNSq74447cp64OpWz5z322COLiG6/ZsyYkf/gVabcr+c3ExblKXfXf/nLX7JDDz00KxQK2d57751dfPHF2caNG3OeuvqUs+dSqZQ1NzdnI0aMyAYOHJg1NTVlZ555Zvavf/0r/8GryJ///OfN/je3c7dTp07NjjrqqG6fc8ABB2QDBgzI9t577+ymm26q6IzeNh0ASKYqXmMBAFQHYQEAJCMsAIBkhAUAkIywAACSERYAQDLCAgBIRlgAAMkICwAgGWEBACQjLACAZIQFAJDM/wPtyEe9ddUmcAAAAABJRU5ErkJggg==", "text/plain": [ "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "catboost_df[\"interaction\"].hist()\n" ] }, { "cell_type": "code", "execution_count": 45, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0 5565768\n", "1 1351132\n", "Name: interaction, dtype: int64" ] }, "execution_count": 45, "metadata": {}, "output_type": "execute_result" } ], "source": [ "catboost_df['interaction'].value_counts()" ] }, { "cell_type": "code", "execution_count": 46, "metadata": {}, "outputs": [], "source": [ "catboost_df.to_csv('datasets/catboost_df.csv')" ] }, { "cell_type": "code", "execution_count": 49, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
state_xlevel4_xlevel3_xlevel2_xlevel1_xMolecular Weight_xlogP_xWater Solubility_xlogS_xBioavailability_x...Polar Surface Area (PSA)_ypKa (strongest basic)_yGhose Filter_yMonoisotopic Weight_yMDDR-Like Rule_yPolarizability_yH Bond Acceptor Count_yPhysiological Charge_yRule of Five_yinteraction
0solidB01AEB01AB01B2180.2853-14.00.0464-4.70.0...901.5711.880.02178.9858131.0218.5437.0-4.00.00
1solidB01AEB01AB01B2180.2853-14.00.0464-4.70.0...429.0411.920.01208.6454621.0125.2416.01.00.00
2solidB01AEB01AB01B2180.2853-14.00.0464-4.70.0...495.8910.910.01268.6414391.0130.7418.01.00.00
3solidB01AEB01AB01B2180.2853-14.00.0464-4.70.0...519.89NaN0.01810.0334191.0194.7316.00.00.00
4solidB01AEB01AB01B2180.2853-14.00.0464-4.70.0...435.4111.770.01068.4269561.0104.7815.01.00.00
5solidB01AEB01AB01B2180.2853-14.00.0464-4.70.0...495.6711.790.01429.6698181.0148.9318.01.00.00
6solidB01AEB01AB01B2180.2853-14.00.0464-4.70.0...702.029.590.01619.7103661.0158.9627.0-3.00.00
7solidB01AEB01AB01B2180.2853-14.00.0464-4.70.0...424.9810.660.01414.6840721.0149.3116.01.00.00
8solidB01AEB01AB01B2180.2853-14.00.0464-4.70.0...116.954.110.0247.0245740.020.906.0-2.01.00
9solidB01AEB01AB01B2180.2853-14.00.0464-4.70.0...477.858.680.01354.5674051.0138.7918.03.00.00
\n", "

10 rows × 49 columns

\n", "
" ], "text/plain": [ " state_x level4_x level3_x level2_x level1_x Molecular Weight_x logP_x \\\n", "0 solid B01AE B01A B01 B 2180.2853 -14.0 \n", "1 solid B01AE B01A B01 B 2180.2853 -14.0 \n", "2 solid B01AE B01A B01 B 2180.2853 -14.0 \n", "3 solid B01AE B01A B01 B 2180.2853 -14.0 \n", "4 solid B01AE B01A B01 B 2180.2853 -14.0 \n", "5 solid B01AE B01A B01 B 2180.2853 -14.0 \n", "6 solid B01AE B01A B01 B 2180.2853 -14.0 \n", "7 solid B01AE B01A B01 B 2180.2853 -14.0 \n", "8 solid B01AE B01A B01 B 2180.2853 -14.0 \n", "9 solid B01AE B01A B01 B 2180.2853 -14.0 \n", "\n", " Water Solubility_x logS_x Bioavailability_x ... \\\n", "0 0.0464 -4.7 0.0 ... \n", "1 0.0464 -4.7 0.0 ... \n", "2 0.0464 -4.7 0.0 ... \n", "3 0.0464 -4.7 0.0 ... \n", "4 0.0464 -4.7 0.0 ... \n", "5 0.0464 -4.7 0.0 ... \n", "6 0.0464 -4.7 0.0 ... \n", "7 0.0464 -4.7 0.0 ... \n", "8 0.0464 -4.7 0.0 ... \n", "9 0.0464 -4.7 0.0 ... \n", "\n", " Polar Surface Area (PSA)_y pKa (strongest basic)_y Ghose Filter_y \\\n", "0 901.57 11.88 0.0 \n", "1 429.04 11.92 0.0 \n", "2 495.89 10.91 0.0 \n", "3 519.89 NaN 0.0 \n", "4 435.41 11.77 0.0 \n", "5 495.67 11.79 0.0 \n", "6 702.02 9.59 0.0 \n", "7 424.98 10.66 0.0 \n", "8 116.95 4.11 0.0 \n", "9 477.85 8.68 0.0 \n", "\n", " Monoisotopic Weight_y MDDR-Like Rule_y Polarizability_y \\\n", "0 2178.985813 1.0 218.54 \n", "1 1208.645462 1.0 125.24 \n", "2 1268.641439 1.0 130.74 \n", "3 1810.033419 1.0 194.73 \n", "4 1068.426956 1.0 104.78 \n", "5 1429.669818 1.0 148.93 \n", "6 1619.710366 1.0 158.96 \n", "7 1414.684072 1.0 149.31 \n", "8 247.024574 0.0 20.90 \n", "9 1354.567405 1.0 138.79 \n", "\n", " H Bond Acceptor Count_y Physiological Charge_y Rule of Five_y \\\n", "0 37.0 -4.0 0.0 \n", "1 16.0 1.0 0.0 \n", "2 18.0 1.0 0.0 \n", "3 16.0 0.0 0.0 \n", "4 15.0 1.0 0.0 \n", "5 18.0 1.0 0.0 \n", "6 27.0 -3.0 0.0 \n", "7 16.0 1.0 0.0 \n", "8 6.0 -2.0 1.0 \n", "9 18.0 3.0 0.0 \n", "\n", " interaction \n", "0 0 \n", "1 0 \n", "2 0 \n", "3 0 \n", "4 0 \n", "5 0 \n", "6 0 
\n", "7 0 \n", "8 0 \n", "9 0 \n", "\n", "[10 rows x 49 columns]" ] }, "execution_count": 49, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# drop label name_x and name_y\n", "catboost_df = catboost_df.drop(['name_x', 'name_y'], axis=1)\n", "catboost_df.head(10)" ] }, { "cell_type": "code", "execution_count": 68, "metadata": {}, "outputs": [], "source": [ "# create test and train set\n", "from sklearn.model_selection import train_test_split\n", "X, y = catboost_df.drop('interaction', axis=1), catboost_df['interaction']\n", "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)" ] }, { "cell_type": "code", "execution_count": 66, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "['state_x', 'level4_x', 'level3_x', 'level2_x', 'level1_x', 'state_y', 'level4_y', 'level3_y', 'level2_y', 'level1_y']\n", "['state_x', 'level4_x', 'level3_x', 'level2_x', 'level1_x', 'Molecular Weight_x', 'logP_x', 'Water Solubility_x', 'logS_x', 'Bioavailability_x', 'pKa (strongest acidic)_x', 'Refractivity_x', 'Number of Rings_x', 'H Bond Donor Count_x', 'Rotatable Bond Count_x', 'Polar Surface Area (PSA)_x', 'pKa (strongest basic)_x', 'Ghose Filter_x', 'Monoisotopic Weight_x', 'MDDR-Like Rule_x', 'Polarizability_x', 'H Bond Acceptor Count_x', 'Physiological Charge_x', 'Rule of Five_x', 'state_y', 'level4_y', 'level3_y', 'level2_y', 'level1_y', 'Molecular Weight_y', 'logP_y', 'Water Solubility_y', 'logS_y', 'Bioavailability_y', 'pKa (strongest acidic)_y', 'Refractivity_y', 'Number of Rings_y', 'H Bond Donor Count_y', 'Rotatable Bond Count_y', 'Polar Surface Area (PSA)_y', 'pKa (strongest basic)_y', 'Ghose Filter_y', 'Monoisotopic Weight_y', 'MDDR-Like Rule_y', 'Polarizability_y', 'H Bond Acceptor Count_y', 'Physiological Charge_y', 'Rule of Five_y', 'interaction']\n", "['Molecular Weight_x', 'logP_x', 'Water Solubility_x', 'logS_x', 'Bioavailability_x', 'pKa (strongest acidic)_x', 'Refractivity_x', 
'Number of Rings_x', 'H Bond Donor Count_x', 'Rotatable Bond Count_x', 'Polar Surface Area (PSA)_x', 'pKa (strongest basic)_x', 'Ghose Filter_x', 'Monoisotopic Weight_x', 'MDDR-Like Rule_x', 'Polarizability_x', 'H Bond Acceptor Count_x', 'Physiological Charge_x', 'Rule of Five_x', 'Molecular Weight_y', 'logP_y', 'Water Solubility_y', 'logS_y', 'Bioavailability_y', 'pKa (strongest acidic)_y', 'Refractivity_y', 'Number of Rings_y', 'H Bond Donor Count_y', 'Rotatable Bond Count_y', 'Polar Surface Area (PSA)_y', 'pKa (strongest basic)_y', 'Ghose Filter_y', 'Monoisotopic Weight_y', 'MDDR-Like Rule_y', 'Polarizability_y', 'H Bond Acceptor Count_y', 'Physiological Charge_y', 'Rule of Five_y']\n", "10 + 38 = 49\n" ] } ], "source": [ "# get all the columns whose dtype is object\n", "cat_features = list(catboost_df.select_dtypes(include=['object']).columns)\n", "print(cat_features)\n", "print(list(catboost_df.columns))\n", "float_features = list(catboost_df.select_dtypes(include=['float64']).columns)\n", "print(float_features)\n", "print(f\"{len(cat_features)} + {len(float_features)} = {len(catboost_df.columns)}\")" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.10" }, "orig_nbformat": 4 }, "nbformat": 4, "nbformat_minor": 2 }