Upload rare_disease_kg.py with huggingface_hub
Browse files- rare_disease_kg.py +160 -0
rare_disease_kg.py
ADDED
|
@@ -0,0 +1,160 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Shared heterogeneous rare-disease knowledge graph for Pillars A & C.
|
| 2 |
+
|
| 3 |
+
Loads the full PrimeKG-derived graph if it is present locally (the production
|
| 4 |
+
path), otherwise falls back to a compact, biologically-real fixture covering the
|
| 5 |
+
GEMEO/SUS cohort so the pillars run out-of-the-box with `python pillar_a_*.py`.
|
| 6 |
+
|
| 7 |
+
The fixture encodes *real* rare-disease biology (causal genes, key phenotypes,
|
| 8 |
+
gene–gene interactions, mechanistically-related diseases). It is small on
|
| 9 |
+
purpose — enough to demonstrate the algorithms and reproduce Marfan→FBN1 — and
|
| 10 |
+
is transparently replaced by PrimeKG when `GEMEO_KG_DIR` (or the default
|
| 11 |
+
~/rarasnet-swarm-py/gemeo/data) contains hetero_graph.json + node_ids.json.
|
| 12 |
+
|
| 13 |
+
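Node id convention: "ORPHA:<n>" diseases, "HGNC:<sym>" genes, "HP:<n>" phenotypes.
Note: the built-in fixture keys genes by bare symbol (e.g. "FBN1"); only the
PrimeKG loader adds the "HGNC:" prefix.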
|
| 14 |
+
"""
|
| 15 |
+
from __future__ import annotations
|
| 16 |
+
import json, os
|
| 17 |
+
from dataclasses import dataclass, field
|
| 18 |
+
|
| 19 |
+
# Directory searched for the PrimeKG-derived JSON files. The GEMEO_KG_DIR
# environment variable, when set, takes precedence over the per-user default
# below; the value is resolved once, when this module is imported.
DEFAULT_KG_DIR = os.environ.get(
    "GEMEO_KG_DIR", os.path.expanduser("~/rarasnet-swarm-py/gemeo/data"))
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
# ── Compact biologically-real fixture (ORPHA / HGNC / HP) ────────────────────
|
| 24 |
+
# disease id -> (name, [(gene symbol, "causal"|"modifier"), ...], [HP id, ...])
# Values are positional 3-tuples (not dicts); _build_fixture unpacks them in
# exactly that order.
_DISEASES = {
    "ORPHA:558": ("Marfan syndrome", [("FBN1", "causal"), ("TGFBR2", "modifier")],
                  ["HP:0004942", "HP:0001166", "HP:0000545", "HP:0001763"]),  # aortic dilat, arachnodactyly, myopia, pes planus
    "ORPHA:98896":("Duchenne muscular dystrophy",[("DMD", "causal")],
                  ["HP:0003560", "HP:0003707", "HP:0001644", "HP:0002650"]),
    "ORPHA:70": ("Spinal muscular atrophy 1", [("SMN1", "causal"), ("SMN2", "modifier")],
                 ["HP:0003457", "HP:0001324", "HP:0002093"]),
    "ORPHA:586": ("Cystic fibrosis", [("CFTR", "causal")],
                  ["HP:0006538", "HP:0002202", "HP:0001733", "HP:0002595"]),
    "ORPHA:355": ("Gaucher disease", [("GBA", "causal")],
                  ["HP:0001433", "HP:0001744", "HP:0001873", "HP:0002240"]),
    "ORPHA:579": ("Mucopolysaccharidosis I", [("IDUA", "causal")],
                  ["HP:0001407", "HP:0000256", "HP:0001182"]),
    "ORPHA:580": ("Mucopolysaccharidosis II", [("IDS", "causal")],
                  ["HP:0001407", "HP:0000256", "HP:0001371"]),
    "ORPHA:905": ("Wilson disease", [("ATP7B", "causal")],
                  ["HP:0001399", "HP:0002171", "HP:0001337"]),
    "ORPHA:95": ("Friedreich ataxia", [("FXN", "causal")],
                 ["HP:0002066", "HP:0001251", "HP:0001638"]),
    "ORPHA:636": ("Neurofibromatosis type 1", [("NF1", "causal")],
                  ["HP:0009732", "HP:0000957", "HP:0009735"]),
    "ORPHA:778": ("Rett syndrome", [("MECP2", "causal")],
                  ["HP:0001250", "HP:0000252", "HP:0001260"]),
    "ORPHA:100": ("Ataxia-telangiectasia", [("ATM", "causal")],
                  ["HP:0001251", "HP:0001009", "HP:0002205", "HP:0002584"]),
}
|
| 51 |
+
# gene–gene interactions (PPI / pathway) — enables 2-hop guilt-by-association
# Partners that appear in no _DISEASES entry (e.g. "TGFBR1", "TP53") are still
# added to the fixture as gene nodes when these pairs are loaded.
_GENE_GENE = [("FBN1", "TGFBR2"), ("FBN1", "TGFBR1"), ("SMN1", "SMN2"),
              ("ATM", "TP53"), ("GBA", "SNCA"), ("DMD", "UTRN")]
|
| 54 |
+
# mechanistically-related diseases (storage disorders, neuromuscular, …)
# Both ids of every pair must also be keys of _DISEASES: _build_fixture links
# these pairs without registering any new node for them.
_DIS_DIS = [("ORPHA:579", "ORPHA:580"), ("ORPHA:355", "ORPHA:579"),
            ("ORPHA:70", "ORPHA:98896"), ("ORPHA:95", "ORPHA:100")]
|
| 57 |
+
# Partial HP id -> label map. _build_fixture falls back to the raw HP id as the
# label for any phenotype that is not listed here.
_HP_NAME = {  # a few labels for readable evidence paths
    "HP:0004942": "ascending aortic aneurysm", "HP:0001166": "arachnodactyly",
    "HP:0003560": "muscular dystrophy", "HP:0003457": "EMG: neuropathic changes",
    "HP:0006538": "recurrent pneumonia", "HP:0001433": "hepatosplenomegaly",
    "HP:0002066": "gait ataxia", "HP:0001251": "ataxia", "HP:0001250": "seizure",
}
|
| 63 |
+
|
| 64 |
+
|
| 65 |
+
@dataclass
class KG:
    """Typed heterogeneous knowledge graph keyed by string node ids.

    The three positional fields describe the node set; the remaining fields
    default to empty dicts and hold the adjacency, edge relations and labels.
    """

    # Node ids, in insertion order.
    nodes: list
    # Node id -> integer position.
    idx: dict
    # Node id -> "disease" | "gene" | "phenotype".
    ntype: dict
    # Node id -> set of neighbouring node ids.
    adj: dict = field(default_factory=dict)
    # (u, v) pair -> relation label.
    edge_label: dict = field(default_factory=dict)
    # Node id -> human-readable label.
    names: dict = field(default_factory=dict)

    def neighbors(self, u):
        """Return the neighbour set stored for ``u``, or a new empty set."""
        if u in self.adj:
            return self.adj[u]
        return set()

    def of_type(self, t):
        """Return the ids in ``nodes`` whose type equals ``t``, in node order."""
        matching = []
        for node_id in self.nodes:
            if self.ntype[node_id] == t:
                matching.append(node_id)
        return matching
|
| 80 |
+
|
| 81 |
+
|
| 82 |
+
def _add_edge(adj, lab, u, v, relation):
|
| 83 |
+
adj.setdefault(u, set()).add(v)
|
| 84 |
+
adj.setdefault(v, set()).add(u)
|
| 85 |
+
lab[(u, v)] = relation
|
| 86 |
+
lab[(v, u)] = relation
|
| 87 |
+
|
| 88 |
+
|
| 89 |
+
def _build_fixture() -> KG:
    """Assemble the built-in fixture graph from the module-level tables.

    Nodes are registered in first-seen order: each disease from ``_DISEASES``
    followed by its genes and phenotypes, then any gene that only appears in
    ``_GENE_GENE``. ``_DIS_DIS`` contributes edges only.
    """
    node_order, kind_of, label_of = [], {}, {}
    neighbours, relations = {}, {}

    def register(node_id, kind, label=None):
        # The first sighting fixes the node's type and its position.
        if node_id not in kind_of:
            kind_of[node_id] = kind
            node_order.append(node_id)
        if label:
            label_of[node_id] = label

    for disease_id, record in _DISEASES.items():
        disease_name, gene_links, phenotype_ids = record
        register(disease_id, "disease", disease_name)
        for symbol, role in gene_links:
            register(symbol, "gene", symbol)
            _add_edge(neighbours, relations, disease_id, symbol,
                      f"associated_with:{role}")
        for hp_id in phenotype_ids:
            # Phenotypes without a curated label are labelled by their id.
            register(hp_id, "phenotype", _HP_NAME.get(hp_id, hp_id))
            _add_edge(neighbours, relations, disease_id, hp_id, "has_phenotype")

    for left, right in _GENE_GENE:
        for symbol in (left, right):
            register(symbol, "gene", symbol)
        _add_edge(neighbours, relations, left, right, "interacts_with")

    for first, second in _DIS_DIS:
        _add_edge(neighbours, relations, first, second, "related_to")

    positions = dict(zip(node_order, range(len(node_order))))
    return KG(nodes=node_order, idx=positions, ntype=kind_of,
              adj=neighbours, edge_label=relations, names=label_of)
|
| 111 |
+
|
| 112 |
+
|
| 113 |
+
def _build_from_primekg(kg_dir: str) -> KG:
    """Load the production PrimeKG-derived graph stored in ``kg_dir``.

    Two JSON files are read:

    * ``node_ids.json`` -- ``{node type: {row index: raw id}}``.
    * ``hetero_graph.json`` -- ``{"edges": {edge type: {"src": [...],
      "dst": [...]}}}`` where ``src``/``dst`` hold row indices.

    Raw ids are turned into canonical typed ids ("ORPHA:", "HGNC:", "HP:"
    prefixes; any other node type is prefixed with its own name). Only the
    ``disease__associated_with__gene`` and
    ``disease__has_phenotype__phenotype`` edge types are imported; all other
    edge types are ignored. No labels are read, so ``KG.names`` stays empty.

    Args:
        kg_dir: Directory containing both JSON files.

    Returns:
        The assembled ``KG``.

    Raises:
        OSError: If either file cannot be opened.
        ValueError: If a file is not valid JSON, or a row index is not
            convertible to ``int``.
        KeyError: If an imported edge type lacks ``"src"``/``"dst"`` or refers
            to a node type missing from ``node_ids.json``.
    """
    def read_json(filename):
        # `with` closes the handle as soon as parsing ends instead of leaving
        # it to the garbage collector; the explicit encoding keeps the read
        # independent of the locale's default.
        with open(f"{kg_dir}/{filename}", encoding="utf-8") as handle:
            return json.load(handle)

    g = read_json("hetero_graph.json")
    node_ids = read_json("node_ids.json")

    nodes, ntype, names = [], {}, {}
    adj, lab = {}, {}

    # node type -> {row index -> raw id as text}
    row2id = {
        node_type: {int(row): str(raw) for row, raw in rows.items()}
        for node_type, rows in node_ids.items()
    }

    prefixes = {"disease": "ORPHA:", "gene": "HGNC:", "phenotype": "HP:"}

    def cid(node_type, raw):
        # Canonical typed id: add the type prefix unless it is already there.
        prefix = prefixes.get(node_type, node_type + ":")
        text = str(raw)
        return text if text.startswith(prefix) else prefix + text

    for node_type, rows in row2id.items():
        for raw in rows.values():
            nid = cid(node_type, raw)
            if nid not in ntype:
                ntype[nid] = node_type
                nodes.append(nid)

    # edge type in the file -> (source node type, target node type, relation)
    rel_by_edge_type = {
        "disease__associated_with__gene": ("disease", "gene", "associated_with"),
        "disease__has_phenotype__phenotype": ("disease", "phenotype", "has_phenotype"),
    }
    for edge_type, edges in g.get("edges", {}).items():
        if edge_type not in rel_by_edge_type:
            continue
        src_type, dst_type, rel = rel_by_edge_type[edge_type]
        for s, d in zip(edges["src"], edges["dst"]):
            # An unmapped row index falls back to the index itself; such an
            # endpoint is then dropped by the membership check below.
            u = cid(src_type, row2id[src_type].get(int(s), s))
            v = cid(dst_type, row2id[dst_type].get(int(d), d))
            if u in ntype and v in ntype:
                _add_edge(adj, lab, u, v, rel)

    idx = {n: i for i, n in enumerate(nodes)}
    return KG(nodes=nodes, idx=idx, ntype=ntype, adj=adj, edge_label=lab, names=names)
|
| 143 |
+
|
| 144 |
+
|
| 145 |
+
def load_kg(kg_dir: str | None = None) -> tuple[KG, str]:
    """Return ``(graph, source)`` where source is "primekg" or "fixture".

    The PrimeKG graph is used when ``hetero_graph.json`` exists under
    ``kg_dir`` (``DEFAULT_KG_DIR`` when ``kg_dir`` is falsy) and loads without
    error. A failed load is reported on stdout and the built-in fixture is
    returned instead.
    """
    root = kg_dir if kg_dir else DEFAULT_KG_DIR
    try:
        graph_file = f"{root}/hetero_graph.json"
        if os.path.exists(graph_file):
            graph = _build_from_primekg(root)
            return graph, "primekg"
    except Exception as err:  # pragma: no cover
        # Best-effort: any load problem degrades to the fixture.
        print(f"[kg] PrimeKG load failed ({err}); using fixture")
    fallback = _build_fixture()
    return fallback, "fixture"
|
| 154 |
+
|
| 155 |
+
|
| 156 |
+
if __name__ == "__main__":
    # Smoke test: load whichever graph is available and print its size by type.
    kg, src = load_kg()
    counts = {t: len(kg.of_type(t)) for t in ("disease", "gene", "phenotype")}
    summary = (
        f"KG source: {src} | {len(kg.nodes)} nodes "
        f"(diseases={counts['disease']}, genes={counts['gene']}, "
        f"phenotypes={counts['phenotype']})"
    )
    print(summary)
|