Spaces:
Sleeping
Sleeping
| """ | |
| parse_core.py — extracts core drug data. | |
| Tables populated: | |
| drugs — one row per drug (all scalar fields + inlined classification) | |
| drug_ids — all DrugBank IDs (primary + secondary/legacy) | |
| drug_attributes — multi-valued string lists (groups, synonyms, organisms, | |
| food_interactions, sequences, ahfs_codes, pdb_entries, | |
| classification alt_parents, substituents) | |
| drug_properties — calculated + experimental properties (merged) | |
| external_identifiers — drug-level cross-database IDs and external links | |
| """ | |
| from config import NP | |
| from utils import t, a, clean | |
def extract(drug_el, primary_id, state):
    """Build the rows of every core table for one drug element.

    Args:
        drug_el: Element for a single drug; handed unchanged to each builder.
        primary_id: Primary DrugBank ID that the builders stamp on their rows.
        state: Part of the extractor signature; not read by this function.

    Returns:
        A dict mapping table name -> list of row dicts, with one entry per
        core table (``drugs``, ``drug_ids``, ``drug_attributes``,
        ``drug_properties``, ``external_identifiers``).
    """
    # Builders run in this order, which is also the key order of the result.
    builders = (
        ("drugs", _drugs),
        ("drug_ids", _drug_ids),
        ("drug_attributes", _drug_attributes),
        ("drug_properties", _drug_properties),
        ("external_identifiers", _external_identifiers),
    )
    return {table: build(drug_el, primary_id) for table, build in builders}
| # ── drugs ───────────────────────────────────────────────────────────────────── | |
def _drugs(drug_el, primary_id):
    """Return a one-element list holding the ``drugs`` row for this drug.

    The row combines the primary ID, the ``type``/``created``/``updated``
    attributes of ``drug_el``, child values read through the ``t`` helper,
    and six ``classification_*`` columns read from the ``classification``
    child. When that child is absent, all six classification columns are
    ``None``.

    Args:
        drug_el: Element for a single drug.
        primary_id: Primary DrugBank ID, stored under ``drugbank_id``.

    Returns:
        ``[row]`` where ``row`` is a dict keyed by column name.
    """
    # (column name, child tag under <classification>)
    classification_fields = (
        ("classification_description", "description"),
        ("classification_direct_parent", "direct-parent"),
        ("classification_kingdom", "kingdom"),
        ("classification_superclass", "superclass"),
        ("classification_class", "class"),
        ("classification_subclass", "subclass"),
    )
    # (column name, child tag under the drug element), in output column order
    text_fields = (
        ("description", "description"),
        ("cas_number", "cas-number"),
        ("unii", "unii"),
        ("average_mass", "average-mass"),
        ("monoisotopic_mass", "monoisotopic-mass"),
        ("state", "state"),
        ("indication", "indication"),
        ("pharmacodynamics", "pharmacodynamics"),
        ("mechanism_of_action", "mechanism-of-action"),
        ("toxicity", "toxicity"),
        ("metabolism", "metabolism"),
        ("absorption", "absorption"),
        ("half_life", "half-life"),
        ("protein_binding", "protein-binding"),
        ("route_of_elimination", "route-of-elimination"),
        ("volume_of_distribution", "volume-of-distribution"),
        ("clearance", "clearance"),
        ("synthesis_reference", "synthesis-reference"),
        ("fda_label_url", "fda-label"),
        ("msds_url", "msds"),
    )

    cls_el = drug_el.find(f"{NP}classification")
    if cls_el is None:
        # No classification element: every classification column is None.
        classification = {column: None for column, _ in classification_fields}
    else:
        classification = {
            column: t(cls_el, tag) for column, tag in classification_fields
        }

    row = {
        "drugbank_id": primary_id,
        "name": t(drug_el, "name"),
        "drug_type": drug_el.get("type"),
    }
    row.update((column, t(drug_el, tag)) for column, tag in text_fields)
    row["created_date"] = drug_el.get("created")
    row["updated_date"] = drug_el.get("updated")
    # Classification scalars go last so the column order stays stable.
    row.update(classification)
    return [row]
| # ── drug_ids ────────────────────────────────────────────────────────────────── | |
def _drug_ids(drug_el, primary_id):
    """Return one ``drug_ids`` row per non-empty ``drugbank-id`` child.

    Args:
        drug_el: Element for a single drug.
        primary_id: Primary DrugBank ID, stored under ``drugbank_id``.

    Returns:
        A list of dicts with ``drugbank_id``, ``legacy_id`` (the cleaned
        element text) and ``is_primary`` (True only when the element's
        ``primary`` attribute equals "true", compared case-insensitively).
    """
    rows = []
    for id_el in drug_el.findall(f"{NP}drugbank-id"):
        id_text = clean(id_el.text)
        if not id_text:
            # Skip ID elements whose cleaned text is empty.
            continue
        primary_flag = id_el.get("primary", "false").lower()
        rows.append({
            "drugbank_id": primary_id,
            "legacy_id": id_text,
            "is_primary": primary_flag == "true",
        })
    return rows
| # ── drug_attributes ─────────────────────────────────────────────────────────── | |
| def _attr(did, atype, value, v2=None, v3=None): | |
| return {"drugbank_id": did, "attr_type": atype, | |
| "value": value, "value2": v2, "value3": v3} | |
def _drug_attributes(drug_el, primary_id):
    """Flatten the drug's multi-valued string lists into ``drug_attributes`` rows.

    For each source listed below, the first matching container child of
    ``drug_el`` is located and every item child with non-empty cleaned text
    becomes one row built by ``_attr``. Synonyms also carry their
    ``language`` and ``coder`` XML attributes (as value2/value3), and
    sequences carry their ``format`` attribute (as value2).

    Args:
        drug_el: Element for a single drug.
        primary_id: Primary DrugBank ID stamped on every row.

    Returns:
        A list of row dicts, ordered by source and then by document order
        within each source.
    """
    # (container tag, item tag, attr_type, item XML attributes -> value2, value3)
    sources = (
        ("groups", "group", "group", ()),
        ("synonyms", "synonym", "synonym", ("language", "coder")),
        ("affected-organisms", "affected-organism", "affected_organism", ()),
        ("food-interactions", "food-interaction", "food_interaction", ()),
        ("sequences", "sequence", "sequence", ("format",)),
        ("ahfs-codes", "ahfs-code", "ahfs_code", ()),
        ("pdb-entries", "pdb-entry", "pdb_entry", ()),
        # Multi-valued parts of the classification: all alternative parents
        # first, then all substituents.
        ("classification", "alternative-parent", "classification_alt_parent", ()),
        ("classification", "substituent", "classification_substituent", ()),
    )

    rows = []
    for container_tag, item_tag, attr_type, extra_attrs in sources:
        container = drug_el.find(f"{NP}{container_tag}")
        if container is None:
            continue
        for item in container.findall(f"{NP}{item_tag}"):
            text = clean(item.text)
            if not text:
                # Items with no usable text produce no row.
                continue
            extras = [clean(item.get(attr_name)) for attr_name in extra_attrs]
            rows.append(_attr(primary_id, attr_type, text, *extras))
    return rows
| # ── drug_properties ─────────────────────────────────────────────────────────── | |
def _drug_properties(drug_el, primary_id):
    """Return ``drug_properties`` rows for calculated and experimental properties.

    Both property containers share the same ``property`` child layout, so
    they are merged into one row list and told apart by ``property_class``.

    Args:
        drug_el: Element for a single drug.
        primary_id: Primary DrugBank ID stamped on every row.

    Returns:
        A list of dicts with ``drugbank_id``, ``property_class``
        ("calculated" or "experimental"), ``kind``, ``value`` and ``source``.
        Calculated rows come before experimental rows.
    """
    # (container tag, property_class label)
    containers = (
        ("calculated-properties", "calculated"),
        ("experimental-properties", "experimental"),
    )

    rows = []
    for container_tag, property_class in containers:
        container = drug_el.find(f"{NP}{container_tag}")
        if container is None:
            continue
        rows.extend(
            {
                "drugbank_id": primary_id,
                "property_class": property_class,
                "kind": t(prop, "kind"),
                "value": t(prop, "value"),
                "source": t(prop, "source"),
            }
            for prop in container.findall(f"{NP}property")
        )
    return rows
| # ── external_identifiers (drug-level) ───────────────────────────────────────── | |
def _external_identifiers(drug_el, primary_id):
    """Return drug-level ``external_identifiers`` rows.

    Two sources feed the same table:

    * ``external-identifiers`` / ``external-identifier``: cross-database
      IDs, stored with ``entity_type`` "drug".
    * ``external-links`` / ``external-link``: links whose ``url`` is stored
      in the ``identifier`` column, with ``entity_type`` "drug_link".

    An entry is kept only when both its resource and its identifier/url are
    truthy.

    Args:
        drug_el: Element for a single drug.
        primary_id: Primary DrugBank ID, stored under ``entity_id``.

    Returns:
        A list of dicts with ``entity_type``, ``entity_id``, ``resource`` and
        ``identifier``. Identifier rows come before link rows.
    """
    # (container tag, item tag, child tag holding the identifier, entity_type)
    sources = (
        ("external-identifiers", "external-identifier", "identifier", "drug"),
        ("external-links", "external-link", "url", "drug_link"),
    )

    rows = []
    for container_tag, item_tag, value_tag, entity_type in sources:
        container = drug_el.find(f"{NP}{container_tag}")
        if container is None:
            continue
        for item in container.findall(f"{NP}{item_tag}"):
            resource = t(item, "resource")
            identifier = t(item, value_tag)
            if not (resource and identifier):
                # Incomplete entries are dropped.
                continue
            rows.append({
                "entity_type": entity_type,
                "entity_id": primary_id,
                "resource": resource,
                "identifier": identifier,
            })
    return rows