# %% [markdown]
# # Convert the DrugBank XML database to JSON and extract features
#
# Run using Python 3 to avoid a non-ascii character error when writing to file
# with the csv module.

# %%
import collections
import csv
import gzip
import io
import json
import os
import re
import xml.etree.ElementTree as ET

import pandas
import requests
import xmltodict

# %%
# Input / output locations used by the conversion and loading cells below.
xml_path = "data/full_database.xml"
json_path = "data/full_database.json"

# %%
# Read the XML file
with open(xml_path, encoding="UTF8") as f:
    db = xmltodict.parse(f.read())

# output as json
# json.dump writes the document to the file in chunks, so the full serialized
# string is never held in memory next to the parsed tree.
with open(json_path, "w") as outfile:
    json.dump(db, outfile, indent=4)

# %%
# Experimental property kinds that become columns of the feature table.
desired_props_exp = {
    "Water Solubility",
    "Melting Point",
    "Boiling Point",
    "logP",
    "logS",
    "Hydrophobicity",
    "Isoelectric Point",
    "caco2 Permeability",
    "pKa",
    "Molecular Weight",
    "Radioactivity",
}

# Calculated property kinds that become columns of the feature table.
desired_props_calc = {
    "logP",
    "logS",
    "Water Solubility",
    "Molecular Weight",
    "Monoisotopic Weight",
    "Polar Surface Area (PSA)",
    "Refractivity",
    "Polarizability",
    "Rotatable Bond Count",
    "H Bond Acceptor Count",
    "H Bond Donor Count",
    "pKa (strongest acidic)",
    "pKa (strongest basic)",
    "Physiological Charge",
    "Number of Rings",
    "Bioavailability",
    "Rule of Five",
    "Ghose Filter",
    "MDDR-Like Rule",
    "Veber's Rule",
}

# First integer / decimal / scientific-notation number found in a value string.
_NUMBER_RE = re.compile(r"[-+]?[0-9]*\.?[0-9]+([eE][-+]?[0-9]+)?")

# Row keys for the ATC hierarchy, in the order the levels are read from the
# record (index 0 of the 'level' list goes to 'level4', index 3 to 'level1').
_ATC_LEVEL_KEYS = ("level4", "level3", "level2", "level1")


def _as_list(value):
    """Normalize an xmltodict child node to a list.

    xmltodict represents a repeated element as a list, a lone element as a
    single mapping, and an absent/empty element as None. Treating all three
    uniformly keeps callers from silently dropping the single-element case.
    """
    if value is None:
        return []
    if isinstance(value, (list, tuple)):
        return list(value)
    return [value]


def _dig(node, *keys):
    """Follow `keys` through nested mappings; return None if any step is missing."""
    for key in keys:
        if not isinstance(node, dict):
            return None
        node = node.get(key)
    return node


def _atc_levels(drug):
    """Return the level4..level1 codes of the drug's first ATC code.

    Every key is present in the result; levels that cannot be read stay None.
    """
    levels = dict.fromkeys(_ATC_LEVEL_KEYS)
    atc_codes = _as_list(_dig(drug, "atc-codes", "atc-code"))
    if atc_codes:
        # Only the first ATC code of a drug is used.
        first_code_levels = _as_list(_dig(atc_codes[0], "level"))
        for key, level in zip(_ATC_LEVEL_KEYS, first_code_levels):
            levels[key] = _dig(level, "@code")
    return levels


def getProperties(desired_props, props, row):
    """Copy the numeric value of each desired property into `row` (in place).

    Parameters
    ----------
    desired_props : collection of str
        Property kinds to extract. Each one is guaranteed to be a key of
        `row` afterwards (None when no numeric value was found).
    props : list of mappings, single mapping, or None
        Property records carrying 'kind' and 'value' entries. A lone record
        and a missing section are both accepted.
    row : dict
        Output row. Existing keys are never reset to None, but a value found
        here overwrites one stored by an earlier call for the same kind.

    Each property is handled independently: a record whose value is missing
    or contains no number is skipped and the remaining records are still
    processed.
    """
    for prop in desired_props:
        if prop not in row:
            row[prop] = None

    for prop in _as_list(props):
        if not isinstance(prop, dict):
            continue
        kind = prop.get('kind')
        if not isinstance(kind, str) or kind not in desired_props:
            continue
        value = prop.get('value')
        if not isinstance(value, str):
            # Empty <value/> elements carry no text to search.
            continue
        match = _NUMBER_RE.search(value)
        if match is not None:
            row[kind] = float(match.group(0))


# %%
with open(json_path) as f:
    data = json.load(f)

# %%
# Build one feature row per drug record. Iterating the records directly
# (rather than a fixed count) keeps this correct for any database release.
rows = []
for drug in _as_list(data['drugbank']['drug']):
    row = {}
    row['name'] = drug['name']
    row['state'] = drug.get('state', None)
    row.update(_atc_levels(drug))

    # Experimental values are written first; a calculated value of the same
    # kind (e.g. logP) replaces it because both use the same row key.
    exp_props = _dig(drug, 'experimental-properties', 'property')
    getProperties(desired_props_exp, exp_props, row)

    calc_props = _dig(drug, 'calculated-properties', 'property')
    getProperties(desired_props_calc, calc_props, row)

    rows.append(row)
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " 
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
namestatelevel4level3level2level1HydrophobicityBoiling PointMolecular WeightIsoelectric Point...Polar Surface Area (PSA)Veber's RulepKa (strongest basic)Ghose FilterMonoisotopic WeightMDDR-Like RulePolarizabilityH Bond Acceptor CountPhysiological ChargeRule of Five
0LepirudinsolidB01AEB01AB01BNaNNaNNaNNaN...NaNNoneNaNNaNNaNNaNNaNNaNNaNNaN
1CetuximabliquidL01FEL01FL01L-0.413NaN145781.60008.48...NaNNoneNaNNaNNaNNaNNaNNaNNaNNaN
2Dornase alfaliquidR05CBR05CR05R-0.083NaN29253.90004.58...NaNNoneNaNNaNNaNNaNNaNNaNNaNNaN
3Denileukin diftitoxliquidL01XXL01XL01L-0.301NaN57647.30005.45...NaNNoneNaNNaNNaNNaNNaNNaNNaNNaN
4EtanerceptliquidL04ABL04AL04L-0.529NaN51234.90007.89...NaNNoneNaNNaNNaNNaNNaNNaNNaNNaN
..................................................................
15230AUM-601NoneNoneNoneNoneNoneNaNNaNNaNNaN...NaNNoneNaNNaNNaNNaNNaNNaNNaNNaN
15231FN-1501NoneNoneNoneNoneNoneNaNNaN431.5040NaN...NaNNoneNaNNaN431.218206NaNNaNNaNNaNNaN
15232TinengotinibNoneNoneNoneNoneNoneNaNNaN394.8600NaN...NaNNoneNaNNaN394.130887NaNNaNNaNNaNNaN
15233LipotecanNoneNoneNoneNoneNoneNaNNaN850.7100NaN...NaNNoneNaNNaN850.183062NaNNaNNaNNaNNaN
15234Xenon Xe-129NoneNoneNoneNoneNoneNaNNaN128.9048NaN...NaNNoneNaNNaN128.904781NaNNaNNaNNaNNaN
\n", "

15235 rows × 33 columns

\n", "
" ], "text/plain": [ " name state level4 level3 level2 level1 \\\n", "0 Lepirudin solid B01AE B01A B01 B \n", "1 Cetuximab liquid L01FE L01F L01 L \n", "2 Dornase alfa liquid R05CB R05C R05 R \n", "3 Denileukin diftitox liquid L01XX L01X L01 L \n", "4 Etanercept liquid L04AB L04A L04 L \n", "... ... ... ... ... ... ... \n", "15230 AUM-601 None None None None None \n", "15231 FN-1501 None None None None None \n", "15232 Tinengotinib None None None None None \n", "15233 Lipotecan None None None None None \n", "15234 Xenon Xe-129 None None None None None \n", "\n", " Hydrophobicity Boiling Point Molecular Weight Isoelectric Point \\\n", "0 NaN NaN NaN NaN \n", "1 -0.413 NaN 145781.6000 8.48 \n", "2 -0.083 NaN 29253.9000 4.58 \n", "3 -0.301 NaN 57647.3000 5.45 \n", "4 -0.529 NaN 51234.9000 7.89 \n", "... ... ... ... ... \n", "15230 NaN NaN NaN NaN \n", "15231 NaN NaN 431.5040 NaN \n", "15232 NaN NaN 394.8600 NaN \n", "15233 NaN NaN 850.7100 NaN \n", "15234 NaN NaN 128.9048 NaN \n", "\n", " ... Polar Surface Area (PSA) Veber's Rule pKa (strongest basic) \\\n", "0 ... NaN None NaN \n", "1 ... NaN None NaN \n", "2 ... NaN None NaN \n", "3 ... NaN None NaN \n", "4 ... NaN None NaN \n", "... ... ... ... ... \n", "15230 ... NaN None NaN \n", "15231 ... NaN None NaN \n", "15232 ... NaN None NaN \n", "15233 ... NaN None NaN \n", "15234 ... NaN None NaN \n", "\n", " Ghose Filter Monoisotopic Weight MDDR-Like Rule Polarizability \\\n", "0 NaN NaN NaN NaN \n", "1 NaN NaN NaN NaN \n", "2 NaN NaN NaN NaN \n", "3 NaN NaN NaN NaN \n", "4 NaN NaN NaN NaN \n", "... ... ... ... ... \n", "15230 NaN NaN NaN NaN \n", "15231 NaN 431.218206 NaN NaN \n", "15232 NaN 394.130887 NaN NaN \n", "15233 NaN 850.183062 NaN NaN \n", "15234 NaN 128.904781 NaN NaN \n", "\n", " H Bond Acceptor Count Physiological Charge Rule of Five \n", "0 NaN NaN NaN \n", "1 NaN NaN NaN \n", "2 NaN NaN NaN \n", "3 NaN NaN NaN \n", "4 NaN NaN NaN \n", "... ... ... ... 
# %%
# Assemble the per-drug feature rows into a table and persist the full table.
drugbank_df = pandas.DataFrame(rows)
drugbank_df.to_csv("data/full_database.csv")
drugbank_df

# %%
# Rows must have fewer than `threshold` missing values; afterwards, columns
# must have fewer than `max_missing_per_column` missing values.
# dropna(thresh=k) keeps an entry only if it has at least k non-NA values,
# hence the "size - limit + 1" form.
threshold = 10
max_missing_per_column = 1000
rows_filtered = drugbank_df.dropna(thresh=drugbank_df.shape[1] - threshold + 1)
df = rows_filtered.dropna(axis=1, thresh=rows_filtered.shape[0] - max_missing_per_column + 1)

# %%
# Remaining missing values per column after filtering.
df.isna().sum()

# %%
df.to_csv('data/filtered_dataset.csv')


# %%
def _interaction_partners(drug, allowed_names):
    """Return names of the drug's interaction partners that are in `allowed_names`.

    The interaction list may be missing, a single mapping (exactly one
    interaction), or a list of mappings; all three are handled so a lone
    interaction is not discarded. Entries without a usable name are skipped.
    """
    section = drug.get('drug-interactions')
    entries = section.get("drug-interaction") if isinstance(section, dict) else None
    if entries is None:
        return []
    if isinstance(entries, dict):
        entries = [entries]
    return [
        entry['name']
        for entry in entries
        if isinstance(entry, dict) and entry.get('name') in allowed_names
    ]


# %%
interactions = {}
# get the set of drugs in the filtered df
drugs = set(df["name"])

# Walk every record in the database instead of a fixed count, so the loop
# stays valid when the number of records differs.
drug_records = data['drugbank']['drug']
if isinstance(drug_records, dict):
    drug_records = [drug_records]

for drug in drug_records:
    if drug.get("name", None) in drugs:
        interactions[drug['name']] = _interaction_partners(drug, drugs)

# %%
json_obj = json.dumps(interactions, indent=4)

# output as json
with open("data/interactions.json", "w") as outfile:
    outfile.write(json_obj)