Spaces:
Sleeping
Sleeping
| """ | |
| parse_pathways.py — extracts pathway and reaction data. | |
| Tables populated: | |
| pathways — deduplicated pathway entities (by smpdb_id) | |
| pathway_members — drugs + enzyme uniprot-ids within pathways (member_type discriminator) | |
| reactions — metabolic reaction records; reaction enzymes serialised as | |
| pipe-delimited triples "drugbank_id|name|uniprot_id", joined with "||" | |
| """ | |
| from config import NP | |
| from utils import t, clean | |
def extract(drug_el, primary_id, state):
    """Collect pathway, pathway-member and reaction rows for one drug element.

    Args:
        drug_el: XML element of a single drug record.
        primary_id: identifier written to ``drugbank_id`` on every reaction row.
        state: shared parser state, handed to the helpers, which read and
            update ``pathways_seen`` and ``reaction_counter`` on it.

    Returns:
        A dict with the keys ``"pathways"``, ``"pathway_members"`` and
        ``"reactions"``, each mapping to a list of row dicts.
    """
    # Pathways are processed before reactions, as both helpers touch ``state``.
    pathway_rows, member_rows = _pathways(drug_el, state)
    return {
        "pathways": pathway_rows,
        "pathway_members": member_rows,
        "reactions": _reactions(drug_el, primary_id, state),
    }
| # ── pathways ────────────────────────────────────────────────────────────────── | |
def _pathways(drug_el, state):
    """Build pathway rows and pathway-member rows for one drug element.

    Args:
        drug_el: XML element of a single drug record.
        state: shared parser state; ``state.pathways_seen`` is consulted and
            extended so each ``smpdb_id`` yields a pathway row only once.

    Returns:
        A ``(pathway_rows, member_rows)`` tuple of lists of dicts.
        ``pathway_rows`` holds only pathways not seen before; ``member_rows``
        holds one row per listed drug (``member_type == "drug"``) and per
        enzyme uniprot-id (``member_type == "enzyme"``, no name).
    """
    pathway_rows = []
    members = []

    container = drug_el.find(f"{NP}pathways")
    pathway_els = [] if container is None else container.findall(f"{NP}pathway")

    for pathway_el in pathway_els:
        smpdb_id = t(pathway_el, "smpdb-id")
        if not smpdb_id:
            # Without an smpdb_id there is no key to attach rows to.
            continue

        # The pathway entity itself is emitted on first sight only.
        if smpdb_id not in state.pathways_seen:
            state.pathways_seen.add(smpdb_id)
            pathway_rows.append({
                "smpdb_id": smpdb_id,
                "name": t(pathway_el, "name"),
                "category": t(pathway_el, "category"),
            })

        # NOTE(review): member rows are emitted for every occurrence of the
        # pathway, not just the first one -- confirm duplicates are handled
        # downstream.

        # Members of type "drug": every <drug> that carries a drugbank-id.
        drugs_el = pathway_el.find(f"{NP}drugs")
        drug_els = () if drugs_el is None else drugs_el.findall(f"{NP}drug")
        for drug in drug_els:
            member_id = t(drug, "drugbank-id")
            if not member_id:
                continue
            members.append({
                "smpdb_id": smpdb_id,
                "member_type": "drug",
                "member_id": member_id,
                "member_name": t(drug, "name"),
            })

        # Members of type "enzyme": bare uniprot-id elements, so no name.
        enzymes_el = pathway_el.find(f"{NP}enzymes")
        if enzymes_el is not None:
            uniprot_ids = (
                clean(uid_el.text)
                for uid_el in enzymes_el.findall(f"{NP}uniprot-id")
            )
            members.extend(
                {
                    "smpdb_id": smpdb_id,
                    "member_type": "enzyme",
                    "member_id": uniprot_id,
                    "member_name": None,
                }
                for uniprot_id in uniprot_ids
                if uniprot_id
            )

    return pathway_rows, members
| # ── reactions ───────────────────────────────────────────────────────────────── | |
def _reactions(drug_el, primary_id, state):
    """Build one row per <reaction> found under the drug element.

    Args:
        drug_el: XML element of a single drug record.
        primary_id: identifier stored as ``drugbank_id`` on each row.
        state: shared parser state; ``state.reaction_counter`` is incremented
            once per reaction and the new value becomes ``reaction_id``.

    Returns:
        A list of row dicts. The ``enzymes`` value is ``None`` when the
        reaction lists no enzymes; otherwise it is the "||"-joined sequence of
        "drugbank_id|name|uniprot_id" triples, with missing fields written as
        empty strings.
    """
    rows = []
    container = drug_el.find(f"{NP}reactions")
    if container is None:
        return rows

    for reaction_el in container.findall(f"{NP}reaction"):
        left = reaction_el.find(f"{NP}left-element")
        right = reaction_el.find(f"{NP}right-element")

        # One "id|name|uniprot" triple per enzyme.
        # NOTE(review): an empty middle field produces "||" inside a triple,
        # which is the same text as the separator between triples.
        enzymes_el = reaction_el.find(f"{NP}enzymes")
        enzyme_els = [] if enzymes_el is None else enzymes_el.findall(f"{NP}enzyme")
        triples = []
        for enzyme in enzyme_els:
            fields = [
                t(enzyme, tag) or ""
                for tag in ("drugbank-id", "name", "uniprot-id")
            ]
            triples.append("{}|{}|{}".format(*fields))

        # The counter is advanced before the row is built, so ids start at
        # the previous counter value + 1.
        state.reaction_counter += 1
        rows.append({
            "reaction_id": state.reaction_counter,
            "drugbank_id": primary_id,
            "sequence": t(reaction_el, "sequence"),
            "left_element_id": None if left is None else t(left, "drugbank-id"),
            "left_element_name": None if left is None else t(left, "name"),
            "right_element_id": None if right is None else t(right, "drugbank-id"),
            "right_element_name": None if right is None else t(right, "name"),
            "enzymes": None if not triples else "||".join(triples),
        })

    return rows