{
 "cells": [
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Convert the DrugBank XML database to JSON and extract features"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Run using Python 3 to avoid a non-ASCII character error when writing to file with the csv module."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import collections\n",
    "import csv\n",
    "import gzip\n",
    "import io\n",
    "import json\n",
    "import os\n",
    "import re\n",
    "import xml.etree.ElementTree as ET\n",
    "\n",
    "import pandas\n",
    "import requests\n",
    "import xmltodict"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Input DrugBank XML dump and the JSON copy derived from it.\n",
    "xml_path = \"data/full_database.xml\"\n",
    "json_path = \"data/full_database.json\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Parse the XML file into nested dicts/lists.\n",
    "with open(xml_path, encoding=\"utf-8\") as xml_file:\n",
    "    db = xmltodict.parse(xml_file.read())\n",
    "\n",
    "# Stream the JSON to disk instead of building the whole document as one string first.\n",
    "with open(json_path, \"w\", encoding=\"utf-8\") as json_file:\n",
    "    json.dump(db, json_file, indent=4)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Experimental property kinds to extract for every drug.\n",
    "desired_props_exp = {\n",
    "    \"Water Solubility\",\n",
    "    \"Melting Point\",\n",
    "    \"Boiling Point\",\n",
    "    \"logP\",\n",
    "    \"logS\",\n",
    "    \"Hydrophobicity\",\n",
    "    \"Isoelectric Point\",\n",
    "    \"caco2 Permeability\",\n",
    "    \"pKa\",\n",
    "    \"Molecular Weight\",\n",
    "    \"Radioactivity\",\n",
    "}\n",
    "\n",
    "# Calculated property kinds to extract for every drug.\n",
    "desired_props_calc = {\n",
    "    \"logP\",\n",
    "    \"logS\",\n",
    "    \"Water Solubility\",\n",
    "    \"Molecular Weight\",\n",
    "    \"Monoisotopic Weight\",\n",
    "    \"Polar Surface Area (PSA)\",\n",
    "    \"Refractivity\",\n",
    "    \"Polarizability\",\n",
    "    \"Rotatable Bond Count\",\n",
    "    \"H Bond Acceptor Count\",\n",
    "    \"H Bond Donor Count\",\n",
    "    \"pKa (strongest acidic)\",\n",
    "    \"pKa (strongest basic)\",\n",
    "    \"Physiological Charge\",\n",
    "    \"Number of Rings\",\n",
    "    \"Bioavailability\",\n",
    "    \"Rule of Five\",\n",
    "    \"Ghose Filter\",\n",
    "    \"MDDR-Like Rule\",\n",
    "    \"Veber's Rule\",\n",
    "}\n",
    "\n",
    "# First signed decimal number (optionally with an exponent) in a value string.\n",
    "NUMBER_PATTERN = re.compile(r\"[-+]?[0-9]*\\.?[0-9]+([eE][-+]?[0-9]+)?\")\n",
    "\n",
    "# Row columns for the ATC hierarchy, in the order the levels are read from the data.\n",
    "ATC_LEVEL_COLUMNS = ('level4', 'level3', 'level2', 'level1')\n",
    "\n",
    "\n",
    "def as_list(value):\n",
    "    \"\"\"Return ``value`` as a list: None -> [], list -> unchanged, anything else -> [value].\n",
    "\n",
    "    xmltodict only produces a list when a tag repeats; a tag that occurs once\n",
    "    becomes a single dict/string and an empty tag becomes None.\n",
    "    \"\"\"\n",
    "    if value is None:\n",
    "        return []\n",
    "    return value if isinstance(value, list) else [value]\n",
    "\n",
    "\n",
    "def child_items(element, container_tag, item_tag):\n",
    "    \"\"\"Return ``element[container_tag][item_tag]`` as a list, or [] when either level is missing.\"\"\"\n",
    "    container = element.get(container_tag)\n",
    "    if not isinstance(container, dict):\n",
    "        return []\n",
    "    return as_list(container.get(item_tag))\n",
    "\n",
    "\n",
    "def getProperties(desired_props, props, row):\n",
    "    \"\"\"Store the numeric value of every desired property in ``row`` (modified in place).\n",
    "\n",
    "    desired_props: collection of property kinds to extract.\n",
    "    props: property dicts with 'kind' and 'value' keys; a single dict or None is also accepted.\n",
    "    row: dict to update. Every desired kind becomes a key; kinds that are absent,\n",
    "        or whose value contains no number, stay None. When a kind occurs more than\n",
    "        once, the last parsable value wins.\n",
    "    \"\"\"\n",
    "    # Sets have no stable order between kernel restarts, so sort them to keep\n",
    "    # the resulting column order reproducible.\n",
    "    if isinstance(desired_props, (set, frozenset)):\n",
    "        kinds = sorted(desired_props)\n",
    "    else:\n",
    "        kinds = list(desired_props)\n",
    "    for kind in kinds:\n",
    "        row.setdefault(kind, None)\n",
    "\n",
    "    wanted = set(kinds)\n",
    "    for prop in as_list(props):\n",
    "        if not isinstance(prop, dict):\n",
    "            continue\n",
    "        kind, value = prop.get('kind'), prop.get('value')\n",
    "        if not isinstance(kind, str) or kind not in wanted or not isinstance(value, str):\n",
    "            continue\n",
    "        match = NUMBER_PATTERN.search(value)\n",
    "        # Skip only the unparsable entry instead of abandoning the remaining ones.\n",
    "        if match is not None:\n",
    "            row[kind] = float(match.group(0))\n",
    "\n",
    "\n",
    "def atc_levels(drug):\n",
    "    \"\"\"Return a dict of level4..level1 '@code' values taken from the drug's first ATC code.\n",
    "\n",
    "    Columns without a corresponding level entry are None.\n",
    "    \"\"\"\n",
    "    levels = dict.fromkeys(ATC_LEVEL_COLUMNS)\n",
    "    atc_codes = child_items(drug, 'atc-codes', 'atc-code')\n",
    "    if not atc_codes or not isinstance(atc_codes[0], dict):\n",
    "        return levels\n",
    "    for column, level in zip(ATC_LEVEL_COLUMNS, as_list(atc_codes[0].get('level'))):\n",
    "        if isinstance(level, dict):\n",
    "            levels[column] = level.get('@code')\n",
    "    return levels\n",
    "\n",
    "\n",
    "def drug_to_row(drug):\n",
    "    \"\"\"Flatten one drug record into a row dict: name, state, ATC levels and numeric properties.\"\"\"\n",
    "    row = {'name': drug['name'], 'state': drug.get('state')}\n",
    "    row.update(atc_levels(drug))\n",
    "    getProperties(desired_props_exp, child_items(drug, 'experimental-properties', 'property'), row)\n",
    "    # NOTE: calculated properties are applied second, so for kinds listed in both\n",
    "    # sets (logP, logS, Water Solubility, Molecular Weight) a calculated value\n",
    "    # overwrites the experimental one in the shared column.\n",
    "    getProperties(desired_props_calc, child_items(drug, 'calculated-properties', 'property'), row)\n",
    "    return row"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "with open(json_path, encoding=\"utf-8\") as json_file:\n",
    "    data = json.load(json_file)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Iterate over every drug in the file rather than a hard-coded count, so the\n",
    "# cell works with any DrugBank release.\n",
    "rows = [drug_to_row(drug) for drug in as_list(data['drugbank']['drug'])]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 175,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [ "
| \n", " | name | \n", "state | \n", "level4 | \n", "level3 | \n", "level2 | \n", "level1 | \n", "Hydrophobicity | \n", "Boiling Point | \n", "Molecular Weight | \n", "Isoelectric Point | \n", "... | \n", "Polar Surface Area (PSA) | \n", "Veber's Rule | \n", "pKa (strongest basic) | \n", "Ghose Filter | \n", "Monoisotopic Weight | \n", "MDDR-Like Rule | \n", "Polarizability | \n", "H Bond Acceptor Count | \n", "Physiological Charge | \n", "Rule of Five | \n", "
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | \n", "Lepirudin | \n", "solid | \n", "B01AE | \n", "B01A | \n", "B01 | \n", "B | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "... | \n", "NaN | \n", "None | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "
| 1 | \n", "Cetuximab | \n", "liquid | \n", "L01FE | \n", "L01F | \n", "L01 | \n", "L | \n", "-0.413 | \n", "NaN | \n", "145781.6000 | \n", "8.48 | \n", "... | \n", "NaN | \n", "None | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "
| 2 | \n", "Dornase alfa | \n", "liquid | \n", "R05CB | \n", "R05C | \n", "R05 | \n", "R | \n", "-0.083 | \n", "NaN | \n", "29253.9000 | \n", "4.58 | \n", "... | \n", "NaN | \n", "None | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "
| 3 | \n", "Denileukin diftitox | \n", "liquid | \n", "L01XX | \n", "L01X | \n", "L01 | \n", "L | \n", "-0.301 | \n", "NaN | \n", "57647.3000 | \n", "5.45 | \n", "... | \n", "NaN | \n", "None | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "
| 4 | \n", "Etanercept | \n", "liquid | \n", "L04AB | \n", "L04A | \n", "L04 | \n", "L | \n", "-0.529 | \n", "NaN | \n", "51234.9000 | \n", "7.89 | \n", "... | \n", "NaN | \n", "None | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "
| ... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "
| 15230 | \n", "AUM-601 | \n", "None | \n", "None | \n", "None | \n", "None | \n", "None | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "... | \n", "NaN | \n", "None | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "
| 15231 | \n", "FN-1501 | \n", "None | \n", "None | \n", "None | \n", "None | \n", "None | \n", "NaN | \n", "NaN | \n", "431.5040 | \n", "NaN | \n", "... | \n", "NaN | \n", "None | \n", "NaN | \n", "NaN | \n", "431.218206 | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "
| 15232 | \n", "Tinengotinib | \n", "None | \n", "None | \n", "None | \n", "None | \n", "None | \n", "NaN | \n", "NaN | \n", "394.8600 | \n", "NaN | \n", "... | \n", "NaN | \n", "None | \n", "NaN | \n", "NaN | \n", "394.130887 | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "
| 15233 | \n", "Lipotecan | \n", "None | \n", "None | \n", "None | \n", "None | \n", "None | \n", "NaN | \n", "NaN | \n", "850.7100 | \n", "NaN | \n", "... | \n", "NaN | \n", "None | \n", "NaN | \n", "NaN | \n", "850.183062 | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "
| 15234 | \n", "Xenon Xe-129 | \n", "None | \n", "None | \n", "None | \n", "None | \n", "None | \n", "NaN | \n", "NaN | \n", "128.9048 | \n", "NaN | \n", "... | \n", "NaN | \n", "None | \n", "NaN | \n", "NaN | \n", "128.904781 | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "
15235 rows × 33 columns
\n", "