Spaces:

marwadeeb
/

ddi-checker

Sleeping

File size: 9,995 Bytes

a062f28

"""
parse_core.py — extracts core drug data.

Tables populated:
  drugs                — one row per drug (all scalar fields + inlined classification)
  drug_ids             — all DrugBank IDs (primary + secondary/legacy)
  drug_attributes      — multi-valued string lists (groups, synonyms, organisms,
                          food_interactions, sequences, ahfs_codes, pdb_entries,
                          classification alt_parents, substituents)
  drug_properties      — calculated + experimental properties (merged)
  external_identifiers — drug-level cross-database IDs and external links
"""
from config import NP
from utils import t, a, clean


def extract(drug_el, primary_id, state):
    """Return dict[table_name -> list[row_dict]]."""
    return {
        "drugs":                _drugs(drug_el, primary_id),
        "drug_ids":             _drug_ids(drug_el, primary_id),
        "drug_attributes":      _drug_attributes(drug_el, primary_id),
        "drug_properties":      _drug_properties(drug_el, primary_id),
        "external_identifiers": _external_identifiers(drug_el, primary_id),
    }


# ── drugs ─────────────────────────────────────────────────────────────────────

def _drugs(drug_el, primary_id):
    cls_el = drug_el.find(f"{NP}classification")
    cls = {}
    if cls_el is not None:
        cls = {
            "classification_description":   t(cls_el, "description"),
            "classification_direct_parent":  t(cls_el, "direct-parent"),
            "classification_kingdom":        t(cls_el, "kingdom"),
            "classification_superclass":     t(cls_el, "superclass"),
            "classification_class":          t(cls_el, "class"),
            "classification_subclass":       t(cls_el, "subclass"),
        }

    row = {
        "drugbank_id":      primary_id,
        "name":             t(drug_el, "name"),
        "drug_type":        drug_el.get("type"),
        "description":      t(drug_el, "description"),
        "cas_number":       t(drug_el, "cas-number"),
        "unii":             t(drug_el, "unii"),
        "average_mass":     t(drug_el, "average-mass"),
        "monoisotopic_mass": t(drug_el, "monoisotopic-mass"),
        "state":            t(drug_el, "state"),
        "indication":               t(drug_el, "indication"),
        "pharmacodynamics":         t(drug_el, "pharmacodynamics"),
        "mechanism_of_action":      t(drug_el, "mechanism-of-action"),
        "toxicity":                 t(drug_el, "toxicity"),
        "metabolism":               t(drug_el, "metabolism"),
        "absorption":               t(drug_el, "absorption"),
        "half_life":                t(drug_el, "half-life"),
        "protein_binding":          t(drug_el, "protein-binding"),
        "route_of_elimination":     t(drug_el, "route-of-elimination"),
        "volume_of_distribution":   t(drug_el, "volume-of-distribution"),
        "clearance":                t(drug_el, "clearance"),
        "synthesis_reference":      t(drug_el, "synthesis-reference"),
        "fda_label_url":            t(drug_el, "fda-label"),
        "msds_url":                 t(drug_el, "msds"),
        "created_date":  drug_el.get("created"),
        "updated_date":  drug_el.get("updated"),
        # Classification scalars (empty dict means all will be None)
        "classification_description":  cls.get("classification_description"),
        "classification_direct_parent": cls.get("classification_direct_parent"),
        "classification_kingdom":      cls.get("classification_kingdom"),
        "classification_superclass":   cls.get("classification_superclass"),
        "classification_class":        cls.get("classification_class"),
        "classification_subclass":     cls.get("classification_subclass"),
    }
    return [row]


# ── drug_ids ──────────────────────────────────────────────────────────────────

def _drug_ids(drug_el, primary_id):
    rows = []
    for id_el in drug_el.findall(f"{NP}drugbank-id"):
        val = clean(id_el.text)
        if val:
            rows.append({
                "drugbank_id": primary_id,
                "legacy_id":   val,
                "is_primary":  id_el.get("primary", "false").lower() == "true",
            })
    return rows


# ── drug_attributes ───────────────────────────────────────────────────────────

def _attr(did, atype, value, v2=None, v3=None):
    return {"drugbank_id": did, "attr_type": atype,
            "value": value, "value2": v2, "value3": v3}


def _drug_attributes(drug_el, primary_id):
    rows = []
    did = primary_id

    # Groups
    grps = drug_el.find(f"{NP}groups")
    if grps is not None:
        for g in grps.findall(f"{NP}group"):
            v = clean(g.text)
            if v:
                rows.append(_attr(did, "group", v))

    # Synonyms (with language + coder attributes)
    syns = drug_el.find(f"{NP}synonyms")
    if syns is not None:
        for s in syns.findall(f"{NP}synonym"):
            v = clean(s.text)
            if v:
                rows.append(_attr(did, "synonym", v,
                                  clean(s.get("language")),
                                  clean(s.get("coder"))))

    # Affected organisms
    ao = drug_el.find(f"{NP}affected-organisms")
    if ao is not None:
        for o in ao.findall(f"{NP}affected-organism"):
            v = clean(o.text)
            if v:
                rows.append(_attr(did, "affected_organism", v))

    # Food interactions
    fi = drug_el.find(f"{NP}food-interactions")
    if fi is not None:
        for f_ in fi.findall(f"{NP}food-interaction"):
            v = clean(f_.text)
            if v:
                rows.append(_attr(did, "food_interaction", v))

    # Sequences (biotech drugs — FASTA strings)
    seqs = drug_el.find(f"{NP}sequences")
    if seqs is not None:
        for seq in seqs.findall(f"{NP}sequence"):
            v = clean(seq.text)
            if v:
                rows.append(_attr(did, "sequence", v, clean(seq.get("format"))))

    # AHFS codes
    ahfs = drug_el.find(f"{NP}ahfs-codes")
    if ahfs is not None:
        for code in ahfs.findall(f"{NP}ahfs-code"):
            v = clean(code.text)
            if v:
                rows.append(_attr(did, "ahfs_code", v))

    # PDB entries
    pdb = drug_el.find(f"{NP}pdb-entries")
    if pdb is not None:
        for entry in pdb.findall(f"{NP}pdb-entry"):
            v = clean(entry.text)
            if v:
                rows.append(_attr(did, "pdb_entry", v))

    # Classification multi-valued: alternative-parents + substituents
    cls_el = drug_el.find(f"{NP}classification")
    if cls_el is not None:
        for ap in cls_el.findall(f"{NP}alternative-parent"):
            v = clean(ap.text)
            if v:
                rows.append(_attr(did, "classification_alt_parent", v))
        for sub in cls_el.findall(f"{NP}substituent"):
            v = clean(sub.text)
            if v:
                rows.append(_attr(did, "classification_substituent", v))

    return rows


# ── drug_properties ───────────────────────────────────────────────────────────

def _drug_properties(drug_el, primary_id):
    rows = []

    # Calculated properties
    calc = drug_el.find(f"{NP}calculated-properties")
    if calc is not None:
        for prop in calc.findall(f"{NP}property"):
            rows.append({
                "drugbank_id":    primary_id,
                "property_class": "calculated",
                "kind":           t(prop, "kind"),
                "value":          t(prop, "value"),
                "source":         t(prop, "source"),
            })

    # Experimental properties
    exp = drug_el.find(f"{NP}experimental-properties")
    if exp is not None:
        for prop in exp.findall(f"{NP}property"):
            rows.append({
                "drugbank_id":    primary_id,
                "property_class": "experimental",
                "kind":           t(prop, "kind"),
                "value":          t(prop, "value"),
                "source":         t(prop, "source"),
            })

    return rows


# ── external_identifiers (drug-level) ─────────────────────────────────────────

def _external_identifiers(drug_el, primary_id):
    rows = []

    # Cross-database identifiers (UniProtKB, ChEMBL, PubChem, KEGG, etc.)
    ext_ids = drug_el.find(f"{NP}external-identifiers")
    if ext_ids is not None:
        for ei in ext_ids.findall(f"{NP}external-identifier"):
            resource   = t(ei, "resource")
            identifier = t(ei, "identifier")
            if resource and identifier:
                rows.append({
                    "entity_type": "drug",
                    "entity_id":   primary_id,
                    "resource":    resource,
                    "identifier":  identifier,
                })

    # External links (RxList, PDRhealth, Drugs.com) — stored as identifier=url
    ext_links = drug_el.find(f"{NP}external-links")
    if ext_links is not None:
        for lnk in ext_links.findall(f"{NP}external-link"):
            resource = t(lnk, "resource")
            url      = t(lnk, "url")
            if resource and url:
                rows.append({
                    "entity_type": "drug_link",
                    "entity_id":   primary_id,
                    "resource":    resource,
                    "identifier":  url,
                })

    return rows