File size: 9,995 Bytes
a062f28
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
"""
parse_core.py — extracts core drug data.

Tables populated:
  drugs                — one row per drug (all scalar fields + inlined classification)
  drug_ids             — all DrugBank IDs (primary + secondary/legacy)
  drug_attributes      — multi-valued string lists (groups, synonyms, organisms,
                          food_interactions, sequences, ahfs_codes, pdb_entries,
                          classification alt_parents, substituents)
  drug_properties      — calculated + experimental properties (merged)
  external_identifiers — drug-level cross-database IDs and external links
"""
from config import NP
from utils import t, a, clean


def extract(drug_el, primary_id, state):
    """Build the rows of every core table for one drug element.

    Args:
        drug_el: The drug element to read from.
        primary_id: ID passed through to each per-table builder.
        state: Accepted for interface compatibility; not used here.

    Returns:
        dict[table_name -> list[row_dict]] with one entry per core table
        (drugs, drug_ids, drug_attributes, drug_properties,
        external_identifiers), in that order.
    """
    # Each table is produced by one private builder sharing the same signature.
    builders = (
        ("drugs", _drugs),
        ("drug_ids", _drug_ids),
        ("drug_attributes", _drug_attributes),
        ("drug_properties", _drug_properties),
        ("external_identifiers", _external_identifiers),
    )
    return {table: build(drug_el, primary_id) for table, build in builders}


# ── drugs ─────────────────────────────────────────────────────────────────────

def _drugs(drug_el, primary_id):
    """Return a one-element list holding the ``drugs`` row for this drug.

    Scalar columns are filled with whatever the ``t`` helper yields for the
    matching child tag of ``drug_el``; ``drug_type``, ``created_date`` and
    ``updated_date`` come from attributes on ``drug_el`` itself. The six
    ``classification_*`` columns are read from the ``classification`` child
    and are all ``None`` when that child is absent.

    Args:
        drug_el: The drug element to read from.
        primary_id: Value stored in the ``drugbank_id`` column.

    Returns:
        list containing exactly one row dict.
    """
    # (column, child tag under <classification>)
    classification_columns = (
        ("classification_description", "description"),
        ("classification_direct_parent", "direct-parent"),
        ("classification_kingdom", "kingdom"),
        ("classification_superclass", "superclass"),
        ("classification_class", "class"),
        ("classification_subclass", "subclass"),
    )
    # (column, child tag under the drug element), in output column order
    text_columns = (
        ("description", "description"),
        ("cas_number", "cas-number"),
        ("unii", "unii"),
        ("average_mass", "average-mass"),
        ("monoisotopic_mass", "monoisotopic-mass"),
        ("state", "state"),
        ("indication", "indication"),
        ("pharmacodynamics", "pharmacodynamics"),
        ("mechanism_of_action", "mechanism-of-action"),
        ("toxicity", "toxicity"),
        ("metabolism", "metabolism"),
        ("absorption", "absorption"),
        ("half_life", "half-life"),
        ("protein_binding", "protein-binding"),
        ("route_of_elimination", "route-of-elimination"),
        ("volume_of_distribution", "volume-of-distribution"),
        ("clearance", "clearance"),
        ("synthesis_reference", "synthesis-reference"),
        ("fda_label_url", "fda-label"),
        ("msds_url", "msds"),
    )

    # Classification is optional; without it every classification column is None.
    classification = drug_el.find(f"{NP}classification")
    classification_values = {}
    for column, tag in classification_columns:
        if classification is None:
            classification_values[column] = None
        else:
            classification_values[column] = t(classification, tag)

    record = {
        "drugbank_id": primary_id,
        "name": t(drug_el, "name"),
        "drug_type": drug_el.get("type"),
    }
    for column, tag in text_columns:
        record[column] = t(drug_el, tag)
    record["created_date"] = drug_el.get("created")
    record["updated_date"] = drug_el.get("updated")
    record.update(classification_values)
    return [record]


# ── drug_ids ──────────────────────────────────────────────────────────────────

def _drug_ids(drug_el, primary_id):
    """Return one ``drug_ids`` row per non-empty ``drugbank-id`` child.

    Args:
        drug_el: The drug element to read from.
        primary_id: Value stored in the ``drugbank_id`` column of every row.

    Returns:
        list of row dicts with keys ``drugbank_id``, ``legacy_id`` and
        ``is_primary``. ``is_primary`` is True only when the element's
        ``primary`` attribute equals "true" case-insensitively; a missing
        attribute counts as false.
    """
    id_rows = []
    for node in drug_el.findall(f"{NP}drugbank-id"):
        identifier = clean(node.text)
        if not identifier:
            # Skip IDs that are empty after cleaning.
            continue
        primary_flag = node.get("primary", "false")
        id_rows.append(dict(
            drugbank_id=primary_id,
            legacy_id=identifier,
            is_primary=primary_flag.lower() == "true",
        ))
    return id_rows


# ── drug_attributes ───────────────────────────────────────────────────────────

def _attr(did, atype, value, v2=None, v3=None):
    return {"drugbank_id": did, "attr_type": atype,
            "value": value, "value2": v2, "value3": v3}


def _drug_attributes(drug_el, primary_id):
    """Collect the multi-valued string attributes of a drug.

    Every list handled here has the same shape: a parent element under
    ``drug_el`` with repeated child elements whose cleaned text becomes the
    row's ``value``. Children whose cleaned text is empty are skipped. Some
    lists also copy XML attributes of the child into ``value2``/``value3``
    (synonym: ``language`` and ``coder``; sequence: ``format``).

    Args:
        drug_el: The drug element to read from.
        primary_id: Value stored in the ``drugbank_id`` column of every row.

    Returns:
        list of row dicts built by ``_attr``, grouped in the order of the
        spec table below.
    """
    # (parent tag, child tag, attr_type, child XML attributes -> value2, value3)
    specs = (
        ("groups", "group", "group", ()),
        ("synonyms", "synonym", "synonym", ("language", "coder")),
        ("affected-organisms", "affected-organism", "affected_organism", ()),
        ("food-interactions", "food-interaction", "food_interaction", ()),
        # Sequences (biotech drugs — FASTA strings)
        ("sequences", "sequence", "sequence", ("format",)),
        ("ahfs-codes", "ahfs-code", "ahfs_code", ()),
        ("pdb-entries", "pdb-entry", "pdb_entry", ()),
        # Classification multi-valued children sit directly under
        # <classification>, with no dedicated wrapper element.
        ("classification", "alternative-parent", "classification_alt_parent", ()),
        ("classification", "substituent", "classification_substituent", ()),
    )

    attr_rows = []
    for parent_tag, child_tag, attr_type, xml_attrs in specs:
        parent = drug_el.find(f"{NP}{parent_tag}")
        if parent is None:
            continue
        for node in parent.findall(f"{NP}{child_tag}"):
            text = clean(node.text)
            if not text:
                continue
            extras = [clean(node.get(attr_name)) for attr_name in xml_attrs]
            attr_rows.append(_attr(primary_id, attr_type, text, *extras))
    return attr_rows


# ── drug_properties ───────────────────────────────────────────────────────────

def _drug_properties(drug_el, primary_id):
    """Return calculated and experimental property rows merged into one list.

    Args:
        drug_el: The drug element to read from.
        primary_id: Value stored in the ``drugbank_id`` column of every row.

    Returns:
        list of row dicts with keys ``drugbank_id``, ``property_class``
        ("calculated" or "experimental"), ``kind``, ``value`` and ``source``.
        Calculated rows come first, then experimental ones. A missing
        container element contributes no rows.
    """
    # (container tag, property_class label), in output order
    sections = (
        ("calculated-properties", "calculated"),
        ("experimental-properties", "experimental"),
    )

    prop_rows = []
    for container_tag, prop_class in sections:
        container = drug_el.find(f"{NP}{container_tag}")
        if container is None:
            continue
        prop_rows.extend(
            {
                "drugbank_id": primary_id,
                "property_class": prop_class,
                "kind": t(node, "kind"),
                "value": t(node, "value"),
                "source": t(node, "source"),
            }
            for node in container.findall(f"{NP}property")
        )
    return prop_rows


# ── external_identifiers (drug-level) ─────────────────────────────────────────

def _external_identifiers(drug_el, primary_id):
    """Return drug-level external identifier and external link rows.

    Two sources feed the same row shape:
      * ``external-identifiers`` / ``external-identifier`` — cross-database
        identifiers, emitted with ``entity_type`` "drug".
      * ``external-links`` / ``external-link`` — links whose URL is stored
        in the ``identifier`` column, emitted with ``entity_type``
        "drug_link".

    An entry is kept only when both its resource and its identifier/URL are
    truthy.

    Args:
        drug_el: The drug element to read from.
        primary_id: Value stored in the ``entity_id`` column of every row.

    Returns:
        list of row dicts with keys ``entity_type``, ``entity_id``,
        ``resource`` and ``identifier``; identifier rows precede link rows.
    """
    # (container tag, item tag, tag holding the value, entity_type label)
    sources = (
        ("external-identifiers", "external-identifier", "identifier", "drug"),
        ("external-links", "external-link", "url", "drug_link"),
    )

    found = []
    for container_tag, item_tag, value_tag, entity_type in sources:
        container = drug_el.find(f"{NP}{container_tag}")
        if container is None:
            continue
        for item in container.findall(f"{NP}{item_tag}"):
            resource = t(item, "resource")
            value = t(item, value_tag)
            if not (resource and value):
                # Incomplete entries are dropped.
                continue
            found.append({
                "entity_type": entity_type,
                "entity_id": primary_id,
                "resource": resource,
                "identifier": value,
            })
    return found