Raras-AI
/

gemeo-arch

@@ -1,81 +1,164 @@
-"""GEMEO v2.0 — Pillar A: KG zero-shot onset proposer (worked demonstration).
-Given a patient's primary rare disease (ORPHA), use PrimeKG graph embeddings to
-propose the most likely UNSEEN onset candidates — related diseases (comorbid
-progression), phenotypes (complications), and genes — that the patient has not
-yet manifested. This is the structural-novelty source that a frequency /
-repeat-last-code baseline cannot fabricate.
-Mechanism: cosine similarity in the 64-dim PrimeKG graph-embedding space, which
-encodes disease–disease, disease–phenotype, disease–gene proximity. Candidates
-the patient already has are excluded (the new-onset constraint).
-This is a runnable demonstration on the major rare diseases in the GEMEO/SUS
-cohort. Full quantitative recall requires phenotype-annotated trajectories
-(Mayo multimodal substrate); here we show the mechanism produces biologically
-correct candidates.
 """
-import numpy as np, json, os
-BASE = os.path.expanduser("~/rarasnet-swarm-py/gemeo/data")
-emb = np.load(f"{BASE}/graph_embeddings.npz")
-node_ids = json.load(open(f"{BASE}/node_ids.json"))
-# Build orpha_id -> (node_type, row) index
-disease_emb = emb["disease"]; phen_emb = emb["phenotype"]; gene_emb = emb["gene"]
-# node_ids[type] is {row_index_str: orpha_or_hpo_or_gene_id}
-dis_id2row = {v: int(k) for k, v in node_ids["disease"].items()}
-phen_row2id = {int(k): v for k, v in node_ids["phenotype"].items()}
-gene_row2id = {int(k): v for k, v in node_ids["gene"].items()}
-dis_row2id = {int(k): v for k, v in node_ids["disease"].items()}
-def norm(x): return x / (np.linalg.norm(x, axis=-1, keepdims=True) + 1e-8)
-D = norm(disease_emb); P = norm(phen_emb); G = norm(gene_emb)
-# Cohort rare diseases (ORPHA codes from RARE_CIDS_APAC)
-COHORT = {
-    "100": "Ataxia-telangiectasia", "646": "Niemann-Pick / Gaucher cluster",
-    "355": "Gaucher disease", "98896": "Duchenne muscular dystrophy",
-    "70": "Spinal muscular atrophy type 1", "586": "Cystic fibrosis",
-    "579": "Mucopolysaccharidosis type I", "580": "Mucopolysaccharidosis type II",
-    "905": "Wilson disease", "95": "Friedreich ataxia", "558": "Marfan syndrome",
-    "636": "Neurofibromatosis type 1", "778": "Rett syndrome", "183660": "SCID",
-}
-def propose(orpha, k=8):
-    """Return top-k unseen onset candidates (diseases, phenotypes, genes)."""
-    if orpha not in dis_id2row:
-        return None
-    row = dis_id2row[orpha]
-    q = D[row]
-    # Related diseases (comorbid/progression)
-    dsim = D @ q; dsim[row] = -1
-    top_dis = [(dis_row2id[int(i)], float(dsim[i])) for i in np.argsort(-dsim)[:k]]
-    # Related phenotypes (complications)
-    psim = P @ q
-    top_phen = [(phen_row2id[int(i)], float(psim[i])) for i in np.argsort(-psim)[:k]]
-    # Related genes
-    gsim = G @ q
-    top_gene = [(gene_row2id[int(i)], float(gsim[i])) for i in np.argsort(-gsim)[:k]]
-    return {"diseases": top_dis, "phenotypes": top_phen, "genes": top_gene}
-print("=" * 78)
-print("GEMEO v2.0 — Pillar A: KG zero-shot onset proposer (worked demonstration)")
-print("=" * 78)
-results = {}
-n_mapped = 0
-for orpha, name in COHORT.items():
-    r = propose(orpha, k=6)
-    if r is None:
-        print(f"\n[{orpha}] {name}: NOT in PrimeKG disease nodes")
-        continue
-    n_mapped += 1
-    results[orpha] = {"name": name, **r}
-    print(f"\n[ORPHA:{orpha}] {name}")
-    print(f"  → candidate comorbid-disease onsets: {[d for d,_ in r['diseases'][:5]]}")
-    print(f"  → candidate phenotype complications: {[p for p,_ in r['phenotypes'][:5]]}")
-    print(f"  → associated genes: {[g for g,_ in r['genes'][:5]]}")
-print(f"\n{n_mapped}/{len(COHORT)} cohort diseases mapped to PrimeKG.")
-json.dump(results, open("/tmp/pillar_a_demo.json", "w"), indent=2)
-print("Saved /tmp/pillar_a_demo.json")

+"""GEMEO v2.0 — Pillar A: KG onset proposer via Random-Walk-with-Restart (RWR).
+Given a patient's *manifested* clinical state (their diseases + observed
+phenotypes + known variant genes), propose the most likely UNSEEN first-onset
+candidates — genes, phenotypes (complications), and related diseases — that the
+patient has not yet had.
+Method (SOTA, training-free): **Random Walk with Restart on the heterogeneous
+knowledge graph**, the state-of-the-art guilt-by-association algorithm for
+network-based gene/disease prioritization (RWRH / MultiXrank lineage;
+Valdeolivas 2019, Bioinformatics; Picart-Armada 2023). The walk restarts from
+the patient's seed nodes with probability `r` and otherwise diffuses along typed
+edges; the stationary visitation probability ranks every node by network
+proximity to the patient's actual state. This is real link prediction by
+network propagation — not embedding cosine similarity.
+Key properties:
+  - **New-onset filter**: every already-manifested node is removed from the
+    ranking (we propose what the patient does NOT yet have).
+  - **Genomic seeding (optional)**: if variant pathogenicity scores are supplied
+    (e.g. from Evo 2 / AlphaMissense), the patient's variant-bearing genes are
+    weighted into the restart vector — the genome steers the proposal.
+  - **Traceable evidence**: each candidate ships its shortest path back to a seed.
+Runs out-of-the-box on the built-in rare-disease fixture; transparently uses the
+full PrimeKG graph when present (see rare_disease_kg.load_kg).
 """
+from __future__ import annotations
+import json, os, sys
+from collections import deque
+sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
+from rare_disease_kg import load_kg, KG
+def rwr(kg: KG, seeds: dict[str, float], restart: float = 0.30,
+        tol: float = 1e-8, max_iter: int = 200) -> dict[str, float]:
+    """Random Walk with Restart. `seeds` maps node id -> nonneg weight.
+    Returns a stationary distribution over all nodes (column-normalized
+    transition, uniform over typed neighbors). Pure-Python sparse iteration —
+    no numpy dependency, scales fine to PrimeKG via the adjacency sets.
+    """
+    nodes = kg.nodes
+    s = {n: 0.0 for n in nodes}
+    z = sum(seeds.values()) or 1.0
+    for n, w in seeds.items():
+        if n in s:
+            s[n] = w / z
+    p = dict(s)
+    deg = {n: max(1, len(kg.neighbors(n))) for n in nodes}
+    for _ in range(max_iter):
+        nxt = {n: restart * s[n] for n in nodes}
+        for u in nodes:
+            pu = p[u]
+            if pu == 0.0:
+                continue
+            share = (1.0 - restart) * pu / deg[u]
+            for v in kg.neighbors(u):
+                nxt[v] += share
+        diff = sum(abs(nxt[n] - p[n]) for n in nodes)
+        p = nxt
+        if diff < tol:
+            break
+    return p
+def shortest_path(kg: KG, src: str, dst: str, max_hops: int = 4):
+    """BFS evidence path src→…→dst with relation labels (for auditability)."""
+    if src == dst:
+        return [src]
+    seen = {src}; q = deque([(src, [src])])
+    while q:
+        u, path = q.popleft()
+        if len(path) > max_hops + 1:
+            continue
+        for v in kg.neighbors(u):
+            if v in seen:
+                continue
+            np_ = path + [v]
+            if v == dst:
+                return np_
+            seen.add(v); q.append((v, np_))
+    return None
+def path_str(kg: KG, path) -> str:
+    if not path:
+        return "(no path within horizon)"
+    out = []
+    for a, b in zip(path, path[1:]):
+        rel = kg.edge_label.get((a, b), "—")
+        out.append(f"{a} —{rel}→ ")
+    return "".join(out) + path[-1]
+def propose(kg: KG, manifested: list[str], variant_genes: dict[str, float] | None = None,
+            k: int = 8, restart: float = 0.30):
+    """Propose unseen first-onset candidates for a patient.
+    manifested:    node ids the patient already has (diseases/phenotypes/genes)
+    variant_genes: optional {gene_id: pathogenicity in [0,1]} from a genomic
+                   model (Evo 2 / AlphaMissense) — adds genomic seeds.
+    """
+    manifested = [m for m in manifested if m in kg.idx]
+    seeds = {m: 1.0 for m in manifested}
+    if variant_genes:
+        for g, path_score in variant_genes.items():
+            if g in kg.idx:
+                seeds[g] = seeds.get(g, 0.0) + 2.0 * float(path_score)  # genome weighted up
+    if not seeds:
+        return {"error": "no seed nodes map to the KG"}
+    p = rwr(kg, seeds, restart=restart)
+    manifested_set = set(manifested)
+    out = {"genes": [], "phenotypes": [], "diseases": []}
+    bucket = {"gene": "genes", "phenotype": "phenotypes", "disease": "diseases"}
+    ranked = sorted(((n, sc) for n, sc in p.items() if sc > 0 and n not in manifested_set),
+                    key=lambda x: -x[1])
+    for nid, sc in ranked:
+        b = bucket.get(kg.ntype.get(nid))
+        if not b or len(out[b]) >= k:
+            continue
+        # evidence path back to the nearest seed
+        best_path = None
+        for seed in manifested:
+            pth = shortest_path(kg, seed, nid)
+            if pth and (best_path is None or len(pth) < len(best_path)):
+                best_path = pth
+        out[b].append({"id": nid, "name": kg.names.get(nid, nid),
+                        "rwr_score": round(sc, 6), "evidence": path_str(kg, best_path)})
+    return out
+def _demo():
+    kg, src = load_kg()
+    print("=" * 80)
+    print(f"GEMEO v2.0 — Pillar A: RWR onset proposer   [KG source: {src}, {len(kg.nodes)} nodes]")
+    print("=" * 80)
+    # A patient who presents as Marfan (disease known) — what onsets does the KG propose?
+    cases = [
+        ("Marfan presentation", ["ORPHA:558"], None),
+        ("Marfan, genome-seeded by a pathogenic FBN1 variant",
+         ["HP:0004942"], {"FBN1": 0.97}),   # only an aortic phenotype + a genomic hit, no disease label
+        ("Duchenne presentation", ["ORPHA:98896"], None),
+    ]
+    out = {}
+    for label, manifested, variants in cases:
+        r = propose(kg, manifested, variant_genes=variants, k=5)
+        out[label] = {"manifested": manifested, "variants": variants, **r}
+        print(f"\n[{label}]   seeds={manifested}  variants={variants or '—'}")
+        for kind in ("genes", "phenotypes", "diseases"):
+            if r.get(kind):
+                print(f"  {kind}:")
+                for c in r[kind][:4]:
+                    print(f"    • {c['id']} ({c['name']})  rwr={c['rwr_score']}")
+                    print(f"        evidence: {c['evidence']}")
+    json.dump(out, open("/tmp/pillar_a_demo.json", "w"), indent=2)
+    print("\nNew-onset filter: every already-manifested node is excluded from the ranking.")
+    print("Genomic seeding: pass variant_genes={gene: pathogenicity} from Evo 2 / AlphaMissense.")
+    print("Saved /tmp/pillar_a_demo.json")
+if __name__ == "__main__":
+    _demo()