Spaces:
Sleeping
Sleeping
File size: 4,350 Bytes
a062f28 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 | """
parse_pathways.py — extracts pathway and reaction data.
Tables populated:
    pathways        — deduplicated pathway entities (by smpdb_id)
    pathway_members — drugs + enzyme uniprot-ids within pathways (member_type discriminator)
    reactions       — metabolic reaction records; each reaction enzyme is serialised as a
                      "drugbank_id|name|uniprot_id" triple, and triples are joined with "||"
"""
from config import NP
from utils import t, clean
def extract(drug_el, primary_id, state):
    """Collect pathway, pathway-member and reaction rows for one drug element.

    Args:
        drug_el: XML element for a single drug record.
        primary_id: identifier stored as ``drugbank_id`` on each reaction row.
        state: shared parser state; ``pathways_seen`` and ``reaction_counter``
            are read and updated by the helpers called here.

    Returns:
        A dict with the keys ``"pathways"``, ``"pathway_members"`` and
        ``"reactions"``, each holding a list of row dicts.
    """
    # Pathways are processed before reactions, as in the original ordering.
    pathway_rows, member_rows = _pathways(drug_el, state)
    return {
        "pathways": pathway_rows,
        "pathway_members": member_rows,
        "reactions": _reactions(drug_el, primary_id, state),
    }
# ── pathways ──────────────────────────────────────────────────────────────────
def _pathways(drug_el, state):
    """Build pathway and pathway-member rows from a drug's ``pathways`` child.

    Args:
        drug_el: XML element for a single drug record.
        state: shared parser state; ``state.pathways_seen`` is the set of
            smpdb ids for which a pathway row has already been produced.

    Returns:
        A ``(pathway_rows, member_rows)`` tuple of lists of dicts.
        ``pathway_rows`` only contains pathways not previously recorded in
        ``state.pathways_seen``; ``member_rows`` contains one row per drug or
        enzyme listed under every pathway that has an smpdb id.
    """
    pathway_rows = []
    member_rows = []

    def add_member(smpdb_id, member_type, member_id, member_name):
        # All member rows share one shape; member_type tells drugs from enzymes.
        member_rows.append({
            "smpdb_id": smpdb_id,
            "member_type": member_type,
            "member_id": member_id,
            "member_name": member_name,
        })

    container = drug_el.find(f"{NP}pathways")
    if container is None:
        return pathway_rows, member_rows

    for pathway_el in container.findall(f"{NP}pathway"):
        smpdb_id = t(pathway_el, "smpdb-id")
        if not smpdb_id:
            # Without an smpdb id there is nothing to key the rows on.
            continue

        # The pathway entity itself is emitted once per smpdb_id across calls.
        if smpdb_id not in state.pathways_seen:
            state.pathways_seen.add(smpdb_id)
            pathway_rows.append({
                "smpdb_id": smpdb_id,
                "name": t(pathway_el, "name"),
                "category": t(pathway_el, "category"),
            })

        # NOTE(review): member rows are emitted on every occurrence of a
        # pathway, not only the first — confirm downstream deduplicates.

        # Drug members of this pathway.
        drugs_el = pathway_el.find(f"{NP}drugs")
        if drugs_el is not None:
            for member_el in drugs_el.findall(f"{NP}drug"):
                member_id = t(member_el, "drugbank-id")
                if not member_id:
                    continue
                add_member(smpdb_id, "drug", member_id, t(member_el, "name"))

        # Enzyme members: only a uniprot id is read, so the name stays None.
        enzymes_el = pathway_el.find(f"{NP}enzymes")
        if enzymes_el is not None:
            for uniprot_el in enzymes_el.findall(f"{NP}uniprot-id"):
                uniprot_id = clean(uniprot_el.text)
                if not uniprot_id:
                    continue
                add_member(smpdb_id, "enzyme", uniprot_id, None)

    return pathway_rows, member_rows
# ── reactions ─────────────────────────────────────────────────────────────────
def _reactions(drug_el, primary_id, state):
    """Build one row per ``reaction`` element under a drug's ``reactions`` child.

    Args:
        drug_el: XML element for a single drug record.
        primary_id: value stored in each row's ``drugbank_id`` field.
        state: shared parser state; ``state.reaction_counter`` is incremented
            once per reaction and its new value becomes the ``reaction_id``.

    Returns:
        A list of row dicts (empty when the drug has no ``reactions`` child).
        The ``enzymes`` field holds "drugbank_id|name|uniprot_id" triples
        joined with "||", or None when the reaction lists no enzymes.
    """
    container = drug_el.find(f"{NP}reactions")
    if container is None:
        return []

    def enzyme_triple(enzyme_el):
        # Falsy lookups become empty strings so every triple has three slots.
        db_id = t(enzyme_el, "drugbank-id") or ""
        name = t(enzyme_el, "name") or ""
        uniprot_id = t(enzyme_el, "uniprot-id") or ""
        return f"{db_id}|{name}|{uniprot_id}"

    def side_field(side_el, tag):
        # A reaction may lack a left/right element; report None in that case.
        if side_el is None:
            return None
        return t(side_el, tag)

    rows = []
    for reaction_el in container.findall(f"{NP}reaction"):
        left_el = reaction_el.find(f"{NP}left-element")
        right_el = reaction_el.find(f"{NP}right-element")

        enzymes_el = reaction_el.find(f"{NP}enzymes")
        if enzymes_el is None:
            triples = []
        else:
            triples = [
                enzyme_triple(enzyme_el)
                for enzyme_el in enzymes_el.findall(f"{NP}enzyme")
            ]

        if triples:
            enzymes_value = "||".join(triples)
        else:
            enzymes_value = None

        # The counter is bumped only after the enzymes have been read.
        state.reaction_counter += 1
        rows.append({
            "reaction_id": state.reaction_counter,
            "drugbank_id": primary_id,
            "sequence": t(reaction_el, "sequence"),
            "left_element_id": side_field(left_el, "drugbank-id"),
            "left_element_name": side_field(left_el, "name"),
            "right_element_id": side_field(right_el, "drugbank-id"),
            "right_element_name": side_field(right_el, "name"),
            "enzymes": enzymes_value,
        })

    return rows
|