File size: 4,350 Bytes
a062f28
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
"""
parse_pathways.py — extracts pathway and reaction data.

Tables populated:
  pathways        — deduplicated pathway entities (by smpdb_id)
  pathway_members — drugs + enzyme uniprot-ids within pathways (member_type discriminator)
  reactions       — metabolic reaction records; reaction enzymes serialised as
                    pipe-delimited triples "drugbank_id|name|uniprot_id", with
                    consecutive triples joined by "||"
"""
from config import NP
from utils import t, clean


def extract(drug_el, primary_id, state):
    """Gather the pathway and reaction rows for a single drug element.

    Args:
        drug_el: XML element for one drug; searched for pathway and
            reaction children.
        primary_id: Identifier stored as ``drugbank_id`` on every reaction row.
        state: Shared parser state; it is passed through to the helpers,
            which read and update it.

    Returns:
        A dict keyed by table name (``pathways``, ``pathway_members``,
        ``reactions``), each value being a list of row dicts.
    """
    # Pathways are processed before reactions, matching the table order below.
    pathway_rows, member_rows = _pathways(drug_el, state)
    tables = {
        "pathways":        pathway_rows,
        "pathway_members": member_rows,
    }
    tables["reactions"] = _reactions(drug_el, primary_id, state)
    return tables


# ── pathways ──────────────────────────────────────────────────────────────────

def _pathways(drug_el, state):
    """Build pathway entity rows and pathway membership rows for one drug.

    Args:
        drug_el: XML element for one drug; its ``pathways`` child (if any)
            is scanned for ``pathway`` entries.
        state: Shared parser state. ``state.pathways_seen`` is consulted and
            extended so each ``smpdb_id`` yields a pathway row only once.

    Returns:
        A ``(pathway_rows, member_rows)`` tuple of lists of dicts. Both lists
        are empty when the drug has no ``pathways`` element.
    """
    pathway_rows = []
    membership = []

    container = drug_el.find(f"{NP}pathways")
    if container is None:
        return pathway_rows, membership

    def add_member(smpdb_id, kind, member_id, member_name):
        # Single place that fixes the shape of a pathway_members row.
        membership.append({
            "smpdb_id":   smpdb_id,
            "member_type": kind,
            "member_id":   member_id,
            "member_name": member_name,
        })

    for pathway_el in container.findall(f"{NP}pathway"):
        smpdb_id = t(pathway_el, "smpdb-id")
        if not smpdb_id:
            # Without an smpdb id there is nothing to key the rows on.
            continue

        # The pathway entity itself is emitted only on first sighting.
        if smpdb_id not in state.pathways_seen:
            state.pathways_seen.add(smpdb_id)
            pathway_rows.append({
                "smpdb_id": smpdb_id,
                "name":     t(pathway_el, "name"),
                "category": t(pathway_el, "category"),
            })

        # NOTE(review): membership rows are emitted on every occurrence of a
        # pathway, not just the first; presumably duplicates across drugs are
        # resolved downstream - confirm.

        # Drug members: need a drugbank-id, the name is carried along.
        drugs_container = pathway_el.find(f"{NP}drugs")
        if drugs_container is not None:
            for member_el in drugs_container.findall(f"{NP}drug"):
                member_id = t(member_el, "drugbank-id")
                if member_id:
                    add_member(smpdb_id, "drug", member_id, t(member_el, "name"))

        # Enzyme members: only a uniprot id is read, so the name stays None.
        enzymes_container = pathway_el.find(f"{NP}enzymes")
        if enzymes_container is not None:
            for uniprot_el in enzymes_container.findall(f"{NP}uniprot-id"):
                uniprot_id = clean(uniprot_el.text)
                if uniprot_id:
                    add_member(smpdb_id, "enzyme", uniprot_id, None)

    return pathway_rows, membership


# ── reactions ─────────────────────────────────────────────────────────────────

def _reactions(drug_el, primary_id, state):
    """Build one row per ``reaction`` element found under a drug.

    Args:
        drug_el: XML element for one drug; its ``reactions`` child (if any)
            is scanned for ``reaction`` entries.
        primary_id: Identifier stored as ``drugbank_id`` on every row.
        state: Shared parser state. ``state.reaction_counter`` is incremented
            once per reaction and the new value becomes ``reaction_id``.

    Returns:
        A list of row dicts; empty when the drug has no ``reactions`` element.
        ``enzymes`` holds ``"id|name|uniprot"`` triples joined by ``"||"``,
        or ``None`` when the reaction lists no enzymes.
    """
    container = drug_el.find(f"{NP}reactions")
    if container is None:
        return []

    records = []
    for reaction_el in container.findall(f"{NP}reaction"):
        lhs = reaction_el.find(f"{NP}left-element")
        rhs = reaction_el.find(f"{NP}right-element")

        # One "db_id|name|uniprot_id" triple per enzyme; missing parts
        # collapse to empty strings so every triple keeps three fields.
        enzymes_el = reaction_el.find(f"{NP}enzymes")
        triples = []
        if enzymes_el is not None:
            triples = [
                f"{t(enzyme_el, 'drugbank-id') or ''}"
                f"|{t(enzyme_el, 'name') or ''}"
                f"|{t(enzyme_el, 'uniprot-id') or ''}"
                for enzyme_el in enzymes_el.findall(f"{NP}enzyme")
            ]

        # The counter advances once per reaction, after enzyme parsing.
        state.reaction_counter += 1
        reaction_id = state.reaction_counter

        sequence = t(reaction_el, "sequence")
        if lhs is None:
            left_id, left_name = None, None
        else:
            left_id, left_name = t(lhs, "drugbank-id"), t(lhs, "name")
        if rhs is None:
            right_id, right_name = None, None
        else:
            right_id, right_name = t(rhs, "drugbank-id"), t(rhs, "name")

        records.append({
            "reaction_id":       reaction_id,
            "drugbank_id":       primary_id,
            "sequence":          sequence,
            "left_element_id":   left_id,
            "left_element_name": left_name,
            "right_element_id":  right_id,
            "right_element_name": right_name,
            "enzymes":           "||".join(triples) if triples else None,
        })

    return records