timmers committed on
Commit
8ef444f
·
verified ·
1 Parent(s): 68cce5a

Upload rare_disease_kg.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. rare_disease_kg.py +160 -0
rare_disease_kg.py ADDED
@@ -0,0 +1,160 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Shared heterogeneous rare-disease knowledge graph for Pillars A & C.
2
+
3
+ Loads the full PrimeKG-derived graph if it is present locally (the production
4
+ path), otherwise falls back to a compact, biologically-real fixture covering the
5
+ GEMEO/SUS cohort so the pillars run out-of-the-box with `python pillar_a_*.py`.
6
+
7
+ The fixture encodes *real* rare-disease biology (causal genes, key phenotypes,
8
+ gene–gene interactions, mechanistically-related diseases). It is small on
9
+ purpose — enough to demonstrate the algorithms and reproduce Marfan→FBN1 — and
10
+ is transparently replaced by PrimeKG when `GEMEO_KG_DIR` (or the default
11
+ ~/rarasnet-swarm-py/gemeo/data) contains hetero_graph.json + node_ids.json.
12
+
13
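+ Node id convention: "ORPHA:<n>" diseases, "HGNC:<sym>" genes, "HP:<n>" phenotypes. (The built-in fixture currently keys genes by bare symbol, e.g. "FBN1", without the "HGNC:" prefix.)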
14
+ """
15
+ from __future__ import annotations
16
+ import json, os
17
+ from dataclasses import dataclass, field
18
+
19
# Directory searched for the PrimeKG export. The GEMEO_KG_DIR environment
# variable wins when set; otherwise a path under the user's home is used.
DEFAULT_KG_DIR = os.getenv(
    "GEMEO_KG_DIR",
    os.path.expanduser("~/rarasnet-swarm-py/gemeo/data"),
)
21
+
22
+
23
+ # ── Compact biologically-real fixture (ORPHA / HGNC / HP) ────────────────────
24
+ # disease -> {name, genes:[(gene, "causal"|"modifier")], phenotypes:[hp...]}
25
+ _DISEASES = {
26
+ "ORPHA:558": ("Marfan syndrome", [("FBN1", "causal"), ("TGFBR2", "modifier")],
27
+ ["HP:0004942", "HP:0001166", "HP:0000545", "HP:0001763"]), # aortic dilat, arachnodactyly, myopia, pes planus
28
+ "ORPHA:98896":("Duchenne muscular dystrophy",[("DMD", "causal")],
29
+ ["HP:0003560", "HP:0003707", "HP:0001644", "HP:0002650"]),
30
+ "ORPHA:70": ("Spinal muscular atrophy 1", [("SMN1", "causal"), ("SMN2", "modifier")],
31
+ ["HP:0003457", "HP:0001324", "HP:0002093"]),
32
+ "ORPHA:586": ("Cystic fibrosis", [("CFTR", "causal")],
33
+ ["HP:0006538", "HP:0002202", "HP:0001733", "HP:0002595"]),
34
+ "ORPHA:355": ("Gaucher disease", [("GBA", "causal")],
35
+ ["HP:0001433", "HP:0001744", "HP:0001873", "HP:0002240"]),
36
+ "ORPHA:579": ("Mucopolysaccharidosis I", [("IDUA", "causal")],
37
+ ["HP:0001407", "HP:0000256", "HP:0001182"]),
38
+ "ORPHA:580": ("Mucopolysaccharidosis II", [("IDS", "causal")],
39
+ ["HP:0001407", "HP:0000256", "HP:0001371"]),
40
+ "ORPHA:905": ("Wilson disease", [("ATP7B", "causal")],
41
+ ["HP:0001399", "HP:0002171", "HP:0001337"]),
42
+ "ORPHA:95": ("Friedreich ataxia", [("FXN", "causal")],
43
+ ["HP:0002066", "HP:0001251", "HP:0001638"]),
44
+ "ORPHA:636": ("Neurofibromatosis type 1", [("NF1", "causal")],
45
+ ["HP:0009732", "HP:0000957", "HP:0009735"]),
46
+ "ORPHA:778": ("Rett syndrome", [("MECP2", "causal")],
47
+ ["HP:0001250", "HP:0000252", "HP:0001260"]),
48
+ "ORPHA:100": ("Ataxia-telangiectasia", [("ATM", "causal")],
49
+ ["HP:0001251", "HP:0001009", "HP:0002205", "HP:0002584"]),
50
+ }
51
+ # gene–gene interactions (PPI / pathway) — enables 2-hop guilt-by-association
52
+ _GENE_GENE = [("FBN1", "TGFBR2"), ("FBN1", "TGFBR1"), ("SMN1", "SMN2"),
53
+ ("ATM", "TP53"), ("GBA", "SNCA"), ("DMD", "UTRN")]
54
+ # mechanistically-related diseases (storage disorders, neuromuscular, …)
55
+ _DIS_DIS = [("ORPHA:579", "ORPHA:580"), ("ORPHA:355", "ORPHA:579"),
56
+ ("ORPHA:70", "ORPHA:98896"), ("ORPHA:95", "ORPHA:100")]
57
+ _HP_NAME = { # a few labels for readable evidence paths
58
+ "HP:0004942": "ascending aortic aneurysm", "HP:0001166": "arachnodactyly",
59
+ "HP:0003560": "muscular dystrophy", "HP:0003457": "EMG: neuropathic changes",
60
+ "HP:0006538": "recurrent pneumonia", "HP:0001433": "hepatosplenomegaly",
61
+ "HP:0002066": "gait ataxia", "HP:0001251": "ataxia", "HP:0001250": "seizure",
62
+ }
63
+
64
+
65
@dataclass
class KG:
    """Typed heterogeneous knowledge graph keyed by string node ids.

    Adjacency is kept per node as a set of neighbour ids; relation labels are
    kept per (u, v) pair.
    """
    nodes: list                                      # node ids, in order
    idx: dict                                        # node id -> int index
    ntype: dict                                      # node id -> "disease"|"gene"|"phenotype"
    adj: dict = field(default_factory=dict)          # node id -> set(neighbor ids)
    edge_label: dict = field(default_factory=dict)   # (u, v) -> relation label
    names: dict = field(default_factory=dict)        # node id -> human label

    def neighbors(self, u):
        """Return the neighbour set stored for ``u``; an empty set if ``u`` has none."""
        if u in self.adj:
            return self.adj[u]
        return set()

    def of_type(self, t):
        """Return the node ids whose type equals ``t``, in ``nodes`` order."""
        kinds = self.ntype
        return [node for node in self.nodes if kinds[node] == t]
80
+
81
+
82
+ def _add_edge(adj, lab, u, v, relation):
83
+ adj.setdefault(u, set()).add(v)
84
+ adj.setdefault(v, set()).add(u)
85
+ lab[(u, v)] = relation
86
+ lab[(v, u)] = relation
87
+
88
+
89
def _build_fixture() -> KG:
    """Assemble the built-in fixture graph from the module-level tables.

    Diseases, their genes and their phenotypes come from ``_DISEASES``;
    gene–gene edges from ``_GENE_GENE``; disease–disease edges from
    ``_DIS_DIS``. Node order follows first appearance while walking those
    tables in that sequence.
    """
    node_order, kinds, labels = [], {}, {}
    neighbours, relations = {}, {}

    def register(node_id, kind, label=None):
        # The first sighting fixes a node's type and its slot in the ordering.
        if node_id not in kinds:
            kinds[node_id] = kind
            node_order.append(node_id)
        if label:
            labels[node_id] = label

    for disease_id, (disease_name, gene_links, phenotype_ids) in _DISEASES.items():
        register(disease_id, "disease", disease_name)
        for symbol, role in gene_links:
            # NOTE(review): genes are keyed by bare symbol here, whereas the
            # PrimeKG loader prefixes gene ids with "HGNC:" — confirm callers
            # cope with both forms.
            register(symbol, "gene", symbol)
            _add_edge(neighbours, relations, disease_id, symbol,
                      f"associated_with:{role}")
        for hp_id in phenotype_ids:
            # Fall back to the raw HP id when no readable label is on file.
            register(hp_id, "phenotype", _HP_NAME.get(hp_id, hp_id))
            _add_edge(neighbours, relations, disease_id, hp_id, "has_phenotype")

    for left, right in _GENE_GENE:
        # Interaction partners may be genes that no fixture disease references.
        for symbol in (left, right):
            register(symbol, "gene", symbol)
        _add_edge(neighbours, relations, left, right, "interacts_with")

    for first, second in _DIS_DIS:
        _add_edge(neighbours, relations, first, second, "related_to")

    positions = {node: pos for pos, node in enumerate(node_order)}
    return KG(nodes=node_order, idx=positions, ntype=kinds, adj=neighbours,
              edge_label=relations, names=labels)
111
+
112
+
113
def _build_from_primekg(kg_dir: str) -> KG:
    """Load the production PrimeKG-derived graph (hetero_graph.json + node_ids.json).

    ``node_ids.json`` is read as ``{node type: {row index: raw id}}`` and
    ``hetero_graph.json`` as ``{"edges": {edge type: {"src": [...], "dst": [...]}}}``
    where ``src``/``dst`` hold row indices.

    Args:
        kg_dir: Directory containing the two JSON files.

    Returns:
        A KG whose node ids carry a type prefix ("ORPHA:" for diseases,
        "HGNC:" for genes, "HP:" for phenotypes, "<type>:" for any other
        type). Only ``disease__associated_with__gene`` and
        ``disease__has_phenotype__phenotype`` edges are imported; every other
        edge type is skipped. ``names`` is returned empty because this loader
        reads no labels.

    Raises:
        OSError: If either file cannot be opened.
        ValueError: If a file is not valid JSON or a row index is not an integer.
        KeyError: If an imported edge type lacks ``src``/``dst`` or refers to a
            node type missing from ``node_ids.json``.
    """
    # Context managers close both handles even when parsing fails; the encoding
    # is pinned so decoding does not depend on the platform locale.
    with open(f"{kg_dir}/hetero_graph.json", encoding="utf-8") as fh:
        g = json.load(fh)
    with open(f"{kg_dir}/node_ids.json", encoding="utf-8") as fh:
        node_ids = json.load(fh)

    nodes, ntype, names = [], {}, {}
    adj, lab = {}, {}

    # node type -> {row index -> raw id string}
    row2id = {t: {int(k): str(v) for k, v in m.items()}
              for t, m in node_ids.items()}

    # Built once rather than on every cid() call.
    prefixes = {"disease": "ORPHA:", "gene": "HGNC:", "phenotype": "HP:"}

    def cid(t, raw):
        """Return the canonical typed id for ``raw``, adding the prefix only if absent."""
        pfx = prefixes.get(t, t + ":")
        s = str(raw)
        return s if s.startswith(pfx) else pfx + s

    for t, m in row2id.items():
        for raw in m.values():
            nid = cid(t, raw)
            if nid not in ntype:
                ntype[nid] = t
                nodes.append(nid)

    # edge type key -> (source node type, destination node type, relation label)
    rel_by_edge_type = {
        "disease__associated_with__gene": ("disease", "gene", "associated_with"),
        "disease__has_phenotype__phenotype": ("disease", "phenotype", "has_phenotype"),
    }
    for et, e in g.get("edges", {}).items():
        if et not in rel_by_edge_type:
            continue
        st, dt, rel = rel_by_edge_type[et]
        for s, d in zip(e["src"], e["dst"]):
            # Unknown row indices fall back to the index itself; the membership
            # test below then drops the edge unless that id happens to exist.
            u = cid(st, row2id[st].get(int(s), s))
            v = cid(dt, row2id[dt].get(int(d), d))
            if u in ntype and v in ntype:
                _add_edge(adj, lab, u, v, rel)

    idx = {n: i for i, n in enumerate(nodes)}
    return KG(nodes=nodes, idx=idx, ntype=ntype, adj=adj, edge_label=lab, names=names)
143
+
144
+
145
def load_kg(kg_dir: str | None = None) -> tuple[KG, str]:
    """Return ``(KG, source)``, preferring PrimeKG on disk over the fixture.

    ``source`` is "primekg" when ``hetero_graph.json`` exists under the chosen
    directory and loads without error, otherwise "fixture". A falsy ``kg_dir``
    selects ``DEFAULT_KG_DIR``. Any exception raised while loading PrimeKG is
    printed and the fixture is returned instead.
    """
    root = kg_dir or DEFAULT_KG_DIR
    graph_file = f"{root}/hetero_graph.json"
    try:
        found = os.path.exists(graph_file)
        if found:
            return _build_from_primekg(root), "primekg"
    except Exception as e:  # pragma: no cover
        # Deliberate best-effort: a broken export must not stop the pillars running.
        print(f"[kg] PrimeKG load failed ({e}); using fixture")
    return _build_fixture(), "fixture"
154
+
155
+
156
if __name__ == "__main__":
    # Smoke check: load whichever graph is available and print its size by type.
    graph, origin = load_kg()
    n_disease, n_gene, n_phenotype = (
        len(graph.of_type(kind)) for kind in ("disease", "gene", "phenotype")
    )
    print(
        f"KG source: {origin} | {len(graph.nodes)} nodes "
        f"(diseases={n_disease}, genes={n_gene}, "
        f"phenotypes={n_phenotype})"
    )