ddi-checker / parser /parse_core.py
marwadeeb's picture
added data parsing stage
a062f28
"""
parse_core.py — extracts core drug data.
Tables populated:
  drugs                — one row per drug (all scalar fields + inlined classification)
  drug_ids             — all DrugBank IDs (primary + secondary/legacy)
  drug_attributes      — multi-valued string lists (groups, synonyms, organisms,
                         food_interactions, sequences, ahfs_codes, pdb_entries,
                         classification alt_parents, substituents)
  drug_properties      — calculated + experimental properties (merged)
  external_identifiers — drug-level cross-database IDs and external links
"""
from config import NP
from utils import t, a, clean
def extract(drug_el, primary_id, state):
    """Collect the rows of every core table for one drug element.

    Args:
        drug_el: Element for a single drug; it is handed unchanged to each
            table builder.
        primary_id: Primary DrugBank ID, handed to each table builder.
        state: Accepted as part of the signature; not read by this function.

    Returns:
        dict mapping table name -> list of row dicts, with the keys
        ``drugs``, ``drug_ids``, ``drug_attributes``, ``drug_properties``
        and ``external_identifiers`` (in that order).
    """
    # Ordered (table name, builder) pairs; builders run in this order.
    table_builders = (
        ("drugs", _drugs),
        ("drug_ids", _drug_ids),
        ("drug_attributes", _drug_attributes),
        ("drug_properties", _drug_properties),
        ("external_identifiers", _external_identifiers),
    )
    return {
        table: build(drug_el, primary_id)
        for table, build in table_builders
    }
# ── drugs ─────────────────────────────────────────────────────────────────────
def _drugs(drug_el, primary_id):
    """Build the single ``drugs`` row for one drug.

    Scalar columns are read from ``drug_el`` through ``t()``; ``drug_type``,
    ``created_date`` and ``updated_date`` come from the element's ``type``,
    ``created`` and ``updated`` attributes. The six classification columns
    are read from the ``classification`` child and are all ``None`` when
    that child is absent.

    Args:
        drug_el: Element for a single drug.
        primary_id: Primary DrugBank ID stored in ``drugbank_id``.

    Returns:
        A one-element list holding the row dict.
    """
    # (output column, child tag) pairs, listed in output-column order.
    classification_fields = (
        ("classification_description", "description"),
        ("classification_direct_parent", "direct-parent"),
        ("classification_kingdom", "kingdom"),
        ("classification_superclass", "superclass"),
        ("classification_class", "class"),
        ("classification_subclass", "subclass"),
    )
    text_fields = (
        ("description", "description"),
        ("cas_number", "cas-number"),
        ("unii", "unii"),
        ("average_mass", "average-mass"),
        ("monoisotopic_mass", "monoisotopic-mass"),
        ("state", "state"),
        ("indication", "indication"),
        ("pharmacodynamics", "pharmacodynamics"),
        ("mechanism_of_action", "mechanism-of-action"),
        ("toxicity", "toxicity"),
        ("metabolism", "metabolism"),
        ("absorption", "absorption"),
        ("half_life", "half-life"),
        ("protein_binding", "protein-binding"),
        ("route_of_elimination", "route-of-elimination"),
        ("volume_of_distribution", "volume-of-distribution"),
        ("clearance", "clearance"),
        ("synthesis_reference", "synthesis-reference"),
        ("fda_label_url", "fda-label"),
        ("msds_url", "msds"),
    )

    # Classification is resolved first; a missing element yields all-None.
    classification_el = drug_el.find(f"{NP}classification")
    if classification_el is None:
        classification = {column: None for column, _ in classification_fields}
    else:
        classification = {
            column: t(classification_el, tag)
            for column, tag in classification_fields
        }

    record = {
        "drugbank_id": primary_id,
        "name": t(drug_el, "name"),
        "drug_type": drug_el.get("type"),
    }
    for column, tag in text_fields:
        record[column] = t(drug_el, tag)
    record["created_date"] = drug_el.get("created")
    record["updated_date"] = drug_el.get("updated")
    # Classification columns come last, matching the table's column order.
    record.update(classification)
    return [record]
# ── drug_ids ──────────────────────────────────────────────────────────────────
def _drug_ids(drug_el, primary_id):
    """Build one ``drug_ids`` row per ``drugbank-id`` child of the drug.

    Children whose cleaned text is falsy are skipped. ``is_primary`` is True
    only when the child's ``primary`` attribute equals "true" ignoring case;
    a missing attribute counts as "false".

    Args:
        drug_el: Element for a single drug.
        primary_id: Primary DrugBank ID stored in ``drugbank_id``.

    Returns:
        List of row dicts with ``drugbank_id``, ``legacy_id``, ``is_primary``.
    """
    id_rows = []
    for node in drug_el.findall(f"{NP}drugbank-id"):
        cleaned = clean(node.text)
        if not cleaned:
            continue
        primary_flag = node.get("primary", "false")
        id_rows.append({
            "drugbank_id": primary_id,
            "legacy_id": cleaned,
            "is_primary": primary_flag.lower() == "true",
        })
    return id_rows
# ── drug_attributes ───────────────────────────────────────────────────────────
def _attr(did, atype, value, v2=None, v3=None):
return {"drugbank_id": did, "attr_type": atype,
"value": value, "value2": v2, "value3": v3}
def _drug_attributes(drug_el, primary_id):
    """Build the ``drug_attributes`` rows (multi-valued lists) for one drug.

    Each source is a container child of ``drug_el`` holding repeated item
    elements. One row is emitted per item whose cleaned text is truthy.
    Synonyms also carry their ``language`` and ``coder`` attributes in
    ``value2``/``value3``; sequences carry their ``format`` attribute in
    ``value2``. Rows come out in the order the sources are listed below.

    Args:
        drug_el: Element for a single drug.
        primary_id: Primary DrugBank ID stored in ``drugbank_id``.

    Returns:
        List of row dicts as produced by ``_attr``.
    """
    # (container tag, item tag, attr_type, XML attributes -> value2/value3)
    sources = (
        ("groups", "group", "group", ()),
        ("synonyms", "synonym", "synonym", ("language", "coder")),
        ("affected-organisms", "affected-organism", "affected_organism", ()),
        ("food-interactions", "food-interaction", "food_interaction", ()),
        ("sequences", "sequence", "sequence", ("format",)),
        ("ahfs-codes", "ahfs-code", "ahfs_code", ()),
        ("pdb-entries", "pdb-entry", "pdb_entry", ()),
        # Multi-valued parts of the classification element.
        ("classification", "alternative-parent",
         "classification_alt_parent", ()),
        ("classification", "substituent",
         "classification_substituent", ()),
    )

    collected = []
    for container_tag, item_tag, attr_type, extra_attrs in sources:
        container = drug_el.find(f"{NP}{container_tag}")
        if container is None:
            continue
        for item in container.findall(f"{NP}{item_tag}"):
            text = clean(item.text)
            if not text:
                continue
            # Extra XML attributes are cleaned and passed positionally.
            extras = [clean(item.get(name)) for name in extra_attrs]
            collected.append(_attr(primary_id, attr_type, text, *extras))
    return collected
# ── drug_properties ───────────────────────────────────────────────────────────
def _drug_properties(drug_el, primary_id):
    """Build the ``drug_properties`` rows for one drug.

    Reads every ``property`` under ``calculated-properties`` and then under
    ``experimental-properties``, tagging each row with the matching
    ``property_class`` ("calculated" or "experimental"). A missing
    container contributes no rows.

    Args:
        drug_el: Element for a single drug.
        primary_id: Primary DrugBank ID stored in ``drugbank_id``.

    Returns:
        List of row dicts with ``drugbank_id``, ``property_class``,
        ``kind``, ``value`` and ``source``.
    """
    # (container tag, property_class label); calculated rows come first.
    sections = (
        ("calculated-properties", "calculated"),
        ("experimental-properties", "experimental"),
    )

    property_rows = []
    for container_tag, label in sections:
        container = drug_el.find(f"{NP}{container_tag}")
        if container is None:
            continue
        property_rows.extend(
            {
                "drugbank_id": primary_id,
                "property_class": label,
                "kind": t(node, "kind"),
                "value": t(node, "value"),
                "source": t(node, "source"),
            }
            for node in container.findall(f"{NP}property")
        )
    return property_rows
# ── external_identifiers (drug-level) ─────────────────────────────────────────
def _external_identifiers(drug_el, primary_id):
    """Build the drug-level ``external_identifiers`` rows for one drug.

    Two sources are read, in this order:
      * ``external-identifiers`` / ``external-identifier``: cross-database
        IDs, stored with ``entity_type`` "drug".
      * ``external-links`` / ``external-link``: links whose URL is stored
        in the ``identifier`` column, with ``entity_type`` "drug_link".

    An entry is kept only when both its resource and its identifier/URL
    are truthy.

    Args:
        drug_el: Element for a single drug.
        primary_id: Primary DrugBank ID stored in ``entity_id``.

    Returns:
        List of row dicts with ``entity_type``, ``entity_id``, ``resource``
        and ``identifier``.
    """
    # (container tag, item tag, child tag holding the value, entity_type)
    sections = (
        ("external-identifiers", "external-identifier", "identifier", "drug"),
        ("external-links", "external-link", "url", "drug_link"),
    )

    found = []
    for container_tag, item_tag, value_tag, entity_type in sections:
        container = drug_el.find(f"{NP}{container_tag}")
        if container is None:
            continue
        for node in container.findall(f"{NP}{item_tag}"):
            resource = t(node, "resource")
            value = t(node, value_tag)
            if not (resource and value):
                continue
            found.append({
                "entity_type": entity_type,
                "entity_id": primary_id,
                "resource": resource,
                "identifier": value,
            })
    return found