Spaces:

marwadeeb
/

ddi-checker

Sleeping

App Files Files Community

ddi-checker / parser /parse_core.py

marwadeeb

added data parsing stage

a062f28 2 months ago

raw

history blame contribute delete

10 kB

	"""
	parse_core.py — extracts core drug data.

	Tables populated:
	drugs — one row per drug (all scalar fields + inlined classification)
	drug_ids — all DrugBank IDs (primary + secondary/legacy)
	drug_attributes — multi-valued string lists (groups, synonyms, organisms,
	    food_interactions, sequences, ahfs_codes, pdb_entries,
	    classification alt_parents, substituents)
	drug_properties — calculated + experimental properties (merged)
	external_identifiers — drug-level cross-database IDs and external links
	"""
	from config import NP
	from utils import t, a, clean


	def extract(drug_el, primary_id, state):
	    """Build the core-table rows for a single drug element.

	    Args:
	        drug_el: the drug element handed to every private builder.
	        primary_id: identifier stamped on every produced row.
	        state: accepted for the caller's sake; not read by this function.

	    Returns:
	        dict mapping each table name ("drugs", "drug_ids",
	        "drug_attributes", "drug_properties", "external_identifiers")
	        to the list of row dicts returned by the matching builder.
	    """
	    builders = (
	        ("drugs", _drugs),
	        ("drug_ids", _drug_ids),
	        ("drug_attributes", _drug_attributes),
	        ("drug_properties", _drug_properties),
	        ("external_identifiers", _external_identifiers),
	    )
	    # Builders run in the listed order, which is also the key order of the result.
	    return {table: build(drug_el, primary_id) for table, build in builders}


	# ── drugs ─────────────────────────────────────────────────────────────────────

	def _drugs(drug_el, primary_id):
	    """Return a one-element list holding the `drugs` row for this drug.

	    The row combines, in this key order: the primary id, the name, the
	    `type` attribute, the child-element scalars listed in `text_columns`,
	    the `created` / `updated` attributes, and the inlined classification
	    scalars. Child values are read through the project helper `t`
	    (presumably "text of the named child" — confirm in utils).
	    """
	    # (row column, child tag under <classification>)
	    classification_columns = (
	        ("classification_description", "description"),
	        ("classification_direct_parent", "direct-parent"),
	        ("classification_kingdom", "kingdom"),
	        ("classification_superclass", "superclass"),
	        ("classification_class", "class"),
	        ("classification_subclass", "subclass"),
	    )
	    # (row column, child tag directly under the drug element)
	    text_columns = (
	        ("description", "description"),
	        ("cas_number", "cas-number"),
	        ("unii", "unii"),
	        ("average_mass", "average-mass"),
	        ("monoisotopic_mass", "monoisotopic-mass"),
	        ("state", "state"),
	        ("indication", "indication"),
	        ("pharmacodynamics", "pharmacodynamics"),
	        ("mechanism_of_action", "mechanism-of-action"),
	        ("toxicity", "toxicity"),
	        ("metabolism", "metabolism"),
	        ("absorption", "absorption"),
	        ("half_life", "half-life"),
	        ("protein_binding", "protein-binding"),
	        ("route_of_elimination", "route-of-elimination"),
	        ("volume_of_distribution", "volume-of-distribution"),
	        ("clearance", "clearance"),
	        ("synthesis_reference", "synthesis-reference"),
	        ("fda_label_url", "fda-label"),
	        ("msds_url", "msds"),
	    )

	    classification_el = drug_el.find(f"{NP}classification")
	    if classification_el is None:
	        # No classification element: every classification column is None.
	        classification = {column: None for column, _ in classification_columns}
	    else:
	        classification = {
	            column: t(classification_el, tag)
	            for column, tag in classification_columns
	        }

	    row = {
	        "drugbank_id": primary_id,
	        "name": t(drug_el, "name"),
	        "drug_type": drug_el.get("type"),
	    }
	    for column, tag in text_columns:
	        row[column] = t(drug_el, tag)
	    row["created_date"] = drug_el.get("created")
	    row["updated_date"] = drug_el.get("updated")
	    # Classification scalars go last so the column order stays stable.
	    row.update(classification)
	    return [row]


	# ── drug_ids ──────────────────────────────────────────────────────────────────

	def _drug_ids(drug_el, primary_id):
	    """Return one `drug_ids` row per non-empty `drugbank-id` child.

	    Each row carries the drug's primary id, the id text found on the
	    child (after `clean`), and whether that child's `primary` attribute
	    equals "true" ignoring case (a missing attribute counts as false).
	    """
	    id_rows = []
	    for node in drug_el.findall(f"{NP}drugbank-id"):
	        identifier = clean(node.text)
	        if not identifier:
	            # Skip ids whose cleaned text is empty/falsy.
	            continue
	        primary_flag = node.get("primary", "false")
	        id_rows.append({
	            "drugbank_id": primary_id,
	            "legacy_id": identifier,
	            "is_primary": primary_flag.lower() == "true",
	        })
	    return id_rows


	# ── drug_attributes ───────────────────────────────────────────────────────────

	def _attr(did, atype, value, v2=None, v3=None):
	return {"drugbank_id": did, "attr_type": atype,
	"value": value, "value2": v2, "value3": v3}


	def _drug_attributes(drug_el, primary_id):
	    """Collect the multi-valued string lists of a drug as attribute rows.

	    For every container listed in `sections` that exists under the drug
	    element, each matching child whose cleaned text is non-empty becomes
	    one row built by `_attr`. Children with extra XML attributes
	    (synonym: language, coder; sequence: format) pass those cleaned
	    attribute values on as value2 / value3.

	    Rows are emitted in the order of `sections`: groups, synonyms,
	    affected organisms, food interactions, sequences, AHFS codes,
	    PDB entries, then classification alternative parents and substituents.
	    """
	    # container tag -> ((child tag, attr_type, XML attributes copied to value2/value3), ...)
	    sections = (
	        ("groups", (("group", "group", ()),)),
	        ("synonyms", (("synonym", "synonym", ("language", "coder")),)),
	        ("affected-organisms", (("affected-organism", "affected_organism", ()),)),
	        ("food-interactions", (("food-interaction", "food_interaction", ()),)),
	        ("sequences", (("sequence", "sequence", ("format",)),)),
	        ("ahfs-codes", (("ahfs-code", "ahfs_code", ()),)),
	        ("pdb-entries", (("pdb-entry", "pdb_entry", ()),)),
	        ("classification", (
	            ("alternative-parent", "classification_alt_parent", ()),
	            ("substituent", "classification_substituent", ()),
	        )),
	    )

	    collected = []
	    for container_tag, child_specs in sections:
	        container = drug_el.find(f"{NP}{container_tag}")
	        if container is None:
	            continue
	        for child_tag, attr_type, extra_names in child_specs:
	            for node in container.findall(f"{NP}{child_tag}"):
	                text = clean(node.text)
	                if not text:
	                    continue
	                # Extras are only read for children that produce a row.
	                extras = [clean(node.get(name)) for name in extra_names]
	                collected.append(_attr(primary_id, attr_type, text, *extras))
	    return collected


	# ── drug_properties ───────────────────────────────────────────────────────────

	def _drug_properties(drug_el, primary_id):
	    """Return the merged calculated + experimental property rows.

	    Every `property` child of `calculated-properties` and then of
	    `experimental-properties` yields one row with the drug id, the
	    property class label, and the `kind`, `value` and `source` children
	    read through the project helper `t`. Properties are not filtered:
	    a row is emitted for each one found.
	    """
	    # (container tag, value stored in the property_class column)
	    containers = (
	        ("calculated-properties", "calculated"),
	        ("experimental-properties", "experimental"),
	    )

	    property_rows = []
	    for container_tag, property_class in containers:
	        container = drug_el.find(f"{NP}{container_tag}")
	        if container is None:
	            continue
	        property_rows.extend(
	            {
	                "drugbank_id": primary_id,
	                "property_class": property_class,
	                "kind": t(node, "kind"),
	                "value": t(node, "value"),
	                "source": t(node, "source"),
	            }
	            for node in container.findall(f"{NP}property")
	        )
	    return property_rows


	# ── external_identifiers (drug-level) ─────────────────────────────────────────

	def _external_identifiers(drug_el, primary_id):
	    """Return drug-level external identifier and external link rows.

	    Cross-database identifiers are stored with entity_type "drug";
	    external links are stored with entity_type "drug_link" and their URL
	    in the `identifier` column. An item is skipped unless both its
	    `resource` and its value child come back truthy from `t`.
	    """
	    # (container tag, item tag, child tag holding the value, entity_type label)
	    sources = (
	        ("external-identifiers", "external-identifier", "identifier", "drug"),
	        ("external-links", "external-link", "url", "drug_link"),
	    )

	    found = []
	    for container_tag, item_tag, value_tag, entity_type in sources:
	        container = drug_el.find(f"{NP}{container_tag}")
	        if container is None:
	            continue
	        for item in container.findall(f"{NP}{item_tag}"):
	            resource = t(item, "resource")
	            value = t(item, value_tag)
	            if not (resource and value):
	                continue
	            found.append({
	                "entity_type": entity_type,
	                "entity_id": primary_id,
	                "resource": resource,
	                "identifier": value,
	            })
	    return found