gemeo-twin-stack / src /gemeo /seed_aura.py
timmers's picture
GEMEO world-model — initial release (module + NeuralSurv ckpt + RareBench v49 + KG embeddings)
089d665 verified
"""Seed an empty Aura instance with the minimal KG Gemeo needs.
Without external Disease/Phenotype/Gene nodes, the cohort, subgraph,
drug repurposing, family, reverse-pheno, and PCDT-compliance heads all
return empty (graceful degrade). This script populates a curated rare-
disease subset (~20 diseases, ~45 phenotypes, ~25 genes, plus a few
synthetic confirmed-dx PatientSpaces for cohort retrieval) so the SOTA
layer can demonstrate non-trivial behaviour.
Run: python -m gemeo.seed_aura
Idempotent: uses MERGE for every node. Safe to re-run.
"""
from __future__ import annotations
import asyncio
import logging
import sys
# Module-level logger. Nothing in the visible part of this file logs through
# it; seed() reports its progress with print().
logger = logging.getLogger("gemeo.seed_aura")
# NOTE: basicConfig() runs at import time, so importing this module configures
# the root logger as a side effect.
logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(name)s] %(message)s")
# ─── Curated minimal KG (pediatric / Brazilian rare disease focus) ────────
# Curated Disease records. Each dict is passed as-is as the parameter map of
# the Disease MERGE in seed(), so the keys must stay exactly:
#   orpha, name, name_pt, cid10, inheritance, prevalence, summary
# All text is stored as proper UTF-8 (Portuguese labels keep their diacritics).
DISEASES = [
    # AT
    {
        "orpha": "100",
        "name": "Ataxia-telangiectasia",
        "name_pt": "Ataxia-telangiectasia",
        "cid10": "G11.3",
        "inheritance": "autosomal recessive",
        "prevalence": "1-9/1000000",
        "summary": "Progressive cerebellar ataxia, immunodeficiency, lymphoid malignancy risk.",
    },
    # NPC
    {
        "orpha": "646",
        "name": "Niemann-Pick disease type C",
        "name_pt": "Doença de Niemann-Pick tipo C",
        "cid10": "E75.2",
        "inheritance": "autosomal recessive",
        "prevalence": "1-9/100000",
        "summary": "Lysosomal lipid storage; vertical supranuclear gaze palsy, hepatosplenomegaly.",
    },
    # Gaucher
    {
        "orpha": "355",
        "name": "Gaucher disease",
        "name_pt": "Doença de Gaucher",
        "cid10": "E75.2",
        "inheritance": "autosomal recessive",
        "prevalence": "1-9/100000",
        "summary": "GBA deficiency; hepatosplenomegaly, bone pain, anemia.",
    },
    # Fabry
    {
        "orpha": "324",
        "name": "Fabry disease",
        "name_pt": "Doença de Fabry",
        "cid10": "E75.2",
        "inheritance": "X-linked recessive",
        "prevalence": "1-9/100000",
        "summary": "GLA deficiency; acroparesthesias, angiokeratomas, renal/cardiac involvement.",
    },
    # Pompe
    {
        "orpha": "365",
        "name": "Glycogen storage disease due to acid maltase deficiency",
        "name_pt": "Doença de Pompe",
        "cid10": "E74.0",
        "inheritance": "autosomal recessive",
        "prevalence": "1-9/100000",
        "summary": "GAA deficiency; hypotonia, cardiomyopathy in infants.",
    },
    # MPS I
    {
        "orpha": "579",
        "name": "Mucopolysaccharidosis type 1",
        "name_pt": "MPS I (Hurler-Scheie)",
        "cid10": "E76.0",
        "inheritance": "autosomal recessive",
        "prevalence": "1-9/100000",
        "summary": "IDUA deficiency; coarse facies, dysostosis multiplex.",
    },
    # SMA
    {
        "orpha": "70",
        "name": "Proximal spinal muscular atrophy type 1",
        "name_pt": "Atrofia Muscular Espinhal tipo 1",
        "cid10": "G12.0",
        "inheritance": "autosomal recessive",
        "prevalence": "1-9/100000",
        "summary": "SMN1 loss; severe infantile hypotonia, respiratory failure.",
    },
    # Wilson
    {
        "orpha": "905",
        "name": "Wilson disease",
        "name_pt": "Doença de Wilson",
        "cid10": "E83.0",
        "inheritance": "autosomal recessive",
        "prevalence": "1-9/100000",
        "summary": "ATP7B deficiency; hepatic, neurologic, Kayser-Fleischer rings.",
    },
    # DMD
    {
        "orpha": "98896",
        "name": "Duchenne muscular dystrophy",
        "name_pt": "Distrofia muscular de Duchenne",
        "cid10": "G71.0",
        "inheritance": "X-linked recessive",
        "prevalence": "1-9/100000",
        "summary": "DMD gene mutation; progressive muscle weakness, Gowers sign.",
    },
    # CF
    {
        "orpha": "586",
        "name": "Cystic fibrosis",
        "name_pt": "Fibrose cística",
        "cid10": "E84",
        "inheritance": "autosomal recessive",
        "prevalence": "1-9/100000",
        "summary": "CFTR; recurrent infections, pancreatic insufficiency.",
    },
    # PKU
    {
        "orpha": "716",
        "name": "Phenylketonuria",
        "name_pt": "Fenilcetonúria",
        "cid10": "E70.0",
        "inheritance": "autosomal recessive",
        "prevalence": "1-9/100000",
        "summary": "PAH deficiency; intellectual disability if untreated.",
    },
    # Marfan
    {
        "orpha": "558",
        "name": "Marfan syndrome",
        "name_pt": "Síndrome de Marfan",
        "cid10": "Q87.4",
        "inheritance": "autosomal dominant",
        "prevalence": "1-9/100000",
        "summary": "FBN1; aortic dilation, ectopia lentis, tall stature.",
    },
    # NF1
    {
        "orpha": "636",
        "name": "Neurofibromatosis type 1",
        "name_pt": "Neurofibromatose tipo 1",
        "cid10": "Q85.0",
        "inheritance": "autosomal dominant",
        "prevalence": "1-9/10000",
        "summary": "NF1; café-au-lait, neurofibromas.",
    },
    # Rett
    {
        "orpha": "778",
        "name": "Rett syndrome",
        "name_pt": "Síndrome de Rett",
        "cid10": "F84.2",
        "inheritance": "X-linked dominant",
        "prevalence": "1-9/100000",
        "summary": "MECP2; regression, stereotypies, mostly females.",
    },
    # Friedreich ataxia
    {
        "orpha": "95",
        "name": "Friedreich ataxia",
        "name_pt": "Ataxia de Friedreich",
        "cid10": "G11.1",
        "inheritance": "autosomal recessive",
        "prevalence": "1-9/100000",
        "summary": "FXN GAA expansion; gait ataxia, cardiomyopathy, diabetes.",
    },
    # Pelizaeus-Merzbacher
    {
        "orpha": "702",
        "name": "Pelizaeus-Merzbacher disease",
        "name_pt": "Doença de Pelizaeus-Merzbacher",
        "cid10": "E75.2",
        "inheritance": "X-linked recessive",
        "prevalence": "1-9/1000000",
        "summary": "PLP1; nystagmus, hypotonia, leukodystrophy.",
    },
    # Hunter MPS II
    {
        "orpha": "580",
        "name": "Mucopolysaccharidosis type 2",
        "name_pt": "MPS II (Hunter)",
        "cid10": "E76.1",
        "inheritance": "X-linked recessive",
        "prevalence": "1-9/100000",
        "summary": "IDS deficiency; coarse facies, hepatosplenomegaly.",
    },
    # SCID
    {
        "orpha": "183660",
        "name": "Severe combined immunodeficiency",
        "name_pt": "Imunodeficiência combinada grave",
        "cid10": "D81.1",
        "inheritance": "various",
        "prevalence": "1-9/100000",
        "summary": "Multiple genes; severe T/B/NK deficiency, opportunistic infections.",
    },
    # CAH
    {
        "orpha": "90794",
        "name": "Congenital adrenal hyperplasia due to 21-hydroxylase deficiency",
        "name_pt": "Hiperplasia adrenal congênita",
        "cid10": "E25.0",
        "inheritance": "autosomal recessive",
        "prevalence": "1-9/100000",
        "summary": "CYP21A2; salt-wasting, virilization.",
    },
    # Hereditary spherocytosis (original note: added because of "H" family
    # history; presumably to exercise the family-history head, TODO confirm).
    {
        "orpha": "822",
        "name": "Hereditary spherocytosis",
        "name_pt": "Esferocitose hereditária",
        "cid10": "D58.0",
        "inheritance": "autosomal dominant",
        "prevalence": "1-9/10000",
        "summary": "Membrane defect; hemolysis, splenomegaly, jaundice.",
    },
]
# HPO terms (curated, with PT-BR labels for the most clinically common)
# Phenotype rows: (HPO id, English label, PT-BR label).
# seed() MERGEs Phenotype nodes on the HPO id, so every id must appear exactly
# once here: a repeated id would silently overwrite the earlier row's labels.
PHENOTYPES = [
    ("HP:0001250", "Seizure", "Convulsão"),
    ("HP:0001251", "Ataxia", "Ataxia"),
    ("HP:0001260", "Dysarthria", "Disartria"),
    ("HP:0001009", "Telangiectasia", "Telangiectasia"),
    ("HP:0033106", "Increased serum alpha-fetoprotein", "AFP sérica elevada"),
    ("HP:0002720", "Decreased circulating IgA level", "IgA baixa"),
    ("HP:0006532", "Recurrent pneumonia", "Pneumonia recorrente"),
    ("HP:0002121", "Cerebellar atrophy", "Atrofia cerebelar"),
    ("HP:0000639", "Nystagmus", "Nistagmo"),
    ("HP:0001270", "Motor delay", "Atraso motor"),
    ("HP:0001249", "Intellectual disability", "Deficiência intelectual"),
    ("HP:0002240", "Hepatomegaly", "Hepatomegalia"),
    ("HP:0001744", "Splenomegaly", "Esplenomegalia"),
    ("HP:0000505", "Visual impairment", "Comprometimento visual"),
    ("HP:0001392", "Abnormal liver", "Alteração hepática"),
    ("HP:0001627", "Abnormal heart morphology", "Alteração cardíaca"),
    ("HP:0001638", "Cardiomyopathy", "Cardiomiopatia"),
    ("HP:0002715", "Abnormality of the immune system", "Imunodeficiência"),
    ("HP:0007354", "Amyotrophic lateral sclerosis-like", "Síndrome ELA-like"),
    ("HP:0002493", "Upper motor neuron dysfunction", "Disfunção do neurônio motor superior"),
    ("HP:0001257", "Spasticity", "Espasticidade"),
    ("HP:0001284", "Areflexia", "Arreflexia"),
    ("HP:0008936", "Muscular hypotonia of the trunk", "Hipotonia troncular"),
    ("HP:0003236", "Elevated circulating creatine kinase", "CK elevada"),
    ("HP:0001626", "Abnormal vasculature", "Alteração vascular"),
    # Was a second HP:0001392 row, which overwrote the "Abnormal liver" labels.
    # NOTE(review): HP:0001410 is believed to be the HPO term for liver
    # dysfunction ("Decreased liver function"); confirm against the ontology.
    ("HP:0001410", "Liver dysfunction", "Disfunção hepática"),
    ("HP:0002910", "Elevated hepatic transaminase", "Transaminases elevadas"),
    ("HP:0007360", "Aplasia/Hypoplasia of the cerebellum", "Hipoplasia cerebelar"),
    ("HP:0001263", "Global developmental delay", "Atraso global do desenvolvimento"),
    ("HP:0000252", "Microcephaly", "Microcefalia"),
    ("HP:0001876", "Pancytopenia", "Pancitopenia"),
    ("HP:0001873", "Thrombocytopenia", "Plaquetopenia"),
    ("HP:0001508", "Failure to thrive", "Baixo ganho ponderal"),
    ("HP:0002013", "Vomiting", "Vômitos"),
    ("HP:0002643", "Neonatal respiratory distress", "Insuficiência respiratória neonatal"),
    ("HP:0011675", "Arrhythmia", "Arritmia"),
    ("HP:0001417", "X-linked inheritance", "Herança ligada ao X"),
    ("HP:0001425", "Heterogeneous", "Heterogêneo"),
    ("HP:0010972", "Anemia of inadequate production", "Anemia hipoproliferativa"),
    # Was a second HP:0001873 row, which relabelled Thrombocytopenia as
    # "Hemolytic anemia". NOTE(review): HP:0001878 is believed to be the HPO
    # term for hemolytic anemia; confirm against the ontology.
    ("HP:0001878", "Hemolytic anemia", "Anemia hemolítica"),
    ("HP:0001081", "Cholelithiasis", "Colelitíase"),
    ("HP:0006530", "Interstitial pulmonary disease", "Pneumopatia intersticial"),
    ("HP:0010301", "Spinal cord compression", "Compressão medular"),
    ("HP:0040315", "B-cell lymphoma", "Linfoma de células B"),
    ("HP:0010785", "T-cell lymphoma", "Linfoma de células T"),
]

# Gene rows: (gene symbol, cytogenetic location).
GENES = [
    ("ATM", "11q22.3"),
    ("NPC1", "18q11.2"),
    ("NPC2", "14q24.3"),
    ("GBA", "1q22"),
    ("GLA", "Xq22.1"),
    ("GAA", "17q25.3"),
    ("IDUA", "4p16.3"),
    ("IDS", "Xq28"),
    ("SMN1", "5q13.2"),
    ("SMN2", "5q13.2"),
    ("ATP7B", "13q14.3"),
    ("DMD", "Xp21.2"),
    ("CFTR", "7q31.2"),
    ("PAH", "12q23.2"),
    ("FBN1", "15q21.1"),
    ("NF1", "17q11.2"),
    ("MECP2", "Xq28"),
    ("FXN", "9q21.11"),
    ("PLP1", "Xq22.2"),
    ("CYP21A2", "6p21.33"),
    ("ANK1", "8p11.21"),
    ("SPTB", "14q23.3"),
    ("IL2RG", "Xq13.1"),
    ("ADA", "20q13.12"),
    ("RAG1", "11p12"),
    ("RAG2", "11p12"),
]

# Disease ↔ Phenotype edges: (ORPHA code, HPO id, frequency).
# Every HPO id used here must exist in PHENOTYPES; seed() MATCHes both
# endpoints, so an unknown id yields no edge at all.
DISEASE_PHENOTYPES = [
    # AT (ORPHA:100)
    ("100", "HP:0001251", 0.95),  # ataxia
    ("100", "HP:0001009", 0.85),  # telangiectasia
    ("100", "HP:0033106", 0.95),  # AFP elevated
    ("100", "HP:0002720", 0.80),  # IgA low
    ("100", "HP:0006532", 0.85),  # recurrent pneumonia
    ("100", "HP:0002121", 0.90),  # cerebellar atrophy
    ("100", "HP:0001260", 0.80),  # dysarthria
    ("100", "HP:0002715", 0.85),  # immunodeficiency
    ("100", "HP:0040315", 0.25),  # B-cell lymphoma (lifetime)
    ("100", "HP:0010785", 0.20),  # T-cell lymphoma
    ("100", "HP:0006530", 0.50),  # interstitial pulm disease (older)
    # NPC (ORPHA:646)
    ("646", "HP:0001251", 0.85),
    ("646", "HP:0002240", 0.70),  # hepatomegaly
    ("646", "HP:0001744", 0.80),  # splenomegaly
    ("646", "HP:0001249", 0.75),  # ID
    ("646", "HP:0001260", 0.60),
    ("646", "HP:0000505", 0.70),  # vertical gaze
    # Gaucher (ORPHA:355)
    ("355", "HP:0002240", 0.85),
    ("355", "HP:0001744", 0.95),
    ("355", "HP:0001873", 0.70),  # thrombocytopenia
    # Fabry (ORPHA:324)
    ("324", "HP:0001626", 0.80),
    ("324", "HP:0001638", 0.70),
    # Pompe (ORPHA:365)
    ("365", "HP:0001638", 0.95),
    ("365", "HP:0008936", 0.95),
    ("365", "HP:0003236", 0.90),
    # MPS I (ORPHA:579)
    ("579", "HP:0002240", 0.85),
    ("579", "HP:0001744", 0.70),
    # SMA-1 (ORPHA:70)
    ("70", "HP:0008936", 0.99),
    ("70", "HP:0001284", 0.90),
    ("70", "HP:0001270", 0.99),
    # Wilson (ORPHA:905)
    ("905", "HP:0002910", 0.85),
    ("905", "HP:0001392", 0.90),
    # DMD (ORPHA:98896)
    ("98896", "HP:0001270", 0.95),
    ("98896", "HP:0003236", 0.95),
    ("98896", "HP:0001638", 0.60),
    # CF (ORPHA:586)
    ("586", "HP:0006532", 0.95),
    ("586", "HP:0001508", 0.75),
    # PKU (ORPHA:716)
    ("716", "HP:0001249", 0.85),
    # Friedreich (ORPHA:95)
    ("95", "HP:0001251", 0.95),
    ("95", "HP:0001638", 0.70),
    # Spherocytosis (ORPHA:822)
    ("822", "HP:0001878", 0.95),  # hemolytic anemia (was the thrombocytopenia id)
    ("822", "HP:0001744", 0.80),
    ("822", "HP:0001081", 0.50),
]
# Disease ↔ Gene edges as (ORPHA code, gene symbol, pathogenicity).
# Every curated pair carries the same "pathogenic" label, so the label is
# attached once while the (code, symbol) pairs are expanded.
DISEASE_GENES = [
    (orpha_code, gene_symbol, "pathogenic")
    for orpha_code, gene_symbol in (
        ("100", "ATM"),
        ("646", "NPC1"),
        ("646", "NPC2"),
        ("355", "GBA"),
        ("324", "GLA"),
        ("365", "GAA"),
        ("579", "IDUA"),
        ("580", "IDS"),
        ("70", "SMN1"),
        ("905", "ATP7B"),
        ("98896", "DMD"),
        ("586", "CFTR"),
        ("716", "PAH"),
        ("558", "FBN1"),
        ("636", "NF1"),
        ("778", "MECP2"),
        ("95", "FXN"),
        ("702", "PLP1"),
        ("90794", "CYP21A2"),
        ("822", "ANK1"),
        ("822", "SPTB"),
        ("183660", "IL2RG"),
        ("183660", "ADA"),
        ("183660", "RAG1"),
    )
]
# Synthetic PatientSpaces with a confirmed diagnosis, so that cohort retrieval
# has exemplars to surface. Each compact row below is
#   (space_id, ORPHA code, state, age, HPO panel)
# and is expanded into the dict shape seed() consumes
# (keys: space_id, orpha, state, hpos, age).
SYNTHETIC_PATIENTS = [
    {"space_id": sid, "orpha": orpha_code, "state": uf, "hpos": list(panel), "age": years}
    for sid, orpha_code, uf, years, panel in (
        ("synth-at-001", "100", "SP", 8, ("HP:0001251", "HP:0001009", "HP:0033106", "HP:0006532")),
        ("synth-at-002", "100", "RJ", 10, ("HP:0001251", "HP:0033106", "HP:0002715")),
        ("synth-at-003", "100", "MG", 7, ("HP:0001251", "HP:0001009", "HP:0002121", "HP:0006532")),
        ("synth-at-004", "100", "BA", 14, ("HP:0001251", "HP:0033106", "HP:0040315")),
        ("synth-at-005", "100", "SP", 12, ("HP:0001251", "HP:0001009", "HP:0006530")),
        ("synth-npc-001", "646", "SP", 9, ("HP:0001251", "HP:0001744", "HP:0002240")),
        ("synth-npc-002", "646", "RS", 11, ("HP:0001251", "HP:0001249", "HP:0000505")),
        ("synth-gaucher-001", "355", "SP", 6, ("HP:0002240", "HP:0001744", "HP:0001873")),
        ("synth-fabry-001", "324", "PR", 16, ("HP:0001626", "HP:0001638")),
        ("synth-pompe-001", "365", "SP", 1, ("HP:0001638", "HP:0008936")),
        ("synth-sma1-001", "70", "RJ", 1, ("HP:0008936", "HP:0001284", "HP:0002643")),
        ("synth-dmd-001", "98896", "MG", 5, ("HP:0001270", "HP:0003236")),
    )
]
async def _q(cypher, params=None):
    """Run one Cypher statement through ``space_graph._safe_query``.

    Args:
        cypher: Cypher text to execute.
        params: Parameter mapping for the statement. A falsy value (``None``
            or an empty mapping) is replaced by a fresh empty dict.

    Returns:
        Whatever ``_safe_query`` returns for the statement.
    """
    # Imported at call time, so importing this module does not by itself
    # import space_graph.
    from space_graph import _safe_query

    bound = params if params else {}
    return await _safe_query(cypher, bound)
async def seed():
    """Populate the graph with the curated rare-disease KG defined in this module.

    Statements are issued one at a time through ``_q``, in this order:

    1. one ``Disease`` node per ``DISEASES`` entry (keyed on ``orphaCode``);
    2. one ``Phenotype`` node per ``PHENOTYPES`` row (keyed on ``hpoId``);
    3. one ``Gene`` node per ``GENES`` row (keyed on ``symbol``);
    4. ``HAS_PHENOTYPE`` edges from ``DISEASE_PHENOTYPES``;
    5. ``ASSOCIATED_WITH`` edges from ``DISEASE_GENES``;
    6. for each ``SYNTHETIC_PATIENTS`` entry: a ``PatientSpace``, a confirmed
       ``SpaceHypothesis`` targeting its disease, and one ``SpaceEvent`` per
       HPO id linked to the matching ``Phenotype``.

    Every node and relationship is written with MERGE (+ SET), so re-running
    updates existing data in place instead of duplicating it.

    Progress and a final per-label node inventory are printed to stdout.
    Returns ``None``.
    """
    print("Seeding Aura with curated rare-disease KG...")

    # 1. Diseases. Each dict in DISEASES doubles as the parameter map.
    for disease in DISEASES:
        await _q("""
            MERGE (x:Disease {orphaCode: $orpha})
            SET x.name = $name,
                x.cid10DescriptionPt = $name_pt,
                x.cid10 = $cid10,
                x.inheritance = $inheritance,
                x.prevalence = $prevalence,
                x.summary = $summary
        """, disease)
    print(f" ✓ {len(DISEASES)} Disease nodes")

    # 2. Phenotypes (HPO)
    for hpo, name, name_pt in PHENOTYPES:
        await _q("""
            MERGE (p:Phenotype {hpoId: $hpo})
            SET p.name = $name, p.name_pt = $name_pt
        """, {"hpo": hpo, "name": name, "name_pt": name_pt})
    print(f" ✓ {len(PHENOTYPES)} Phenotype nodes")

    # 3. Genes
    for sym, loc in GENES:
        await _q("""
            MERGE (g:Gene {symbol: $sym})
            SET g.location = $loc
        """, {"sym": sym, "loc": loc})
    print(f" ✓ {len(GENES)} Gene nodes")

    # 4. Disease ↔ Phenotype. Both endpoints are MATCHed, so a row that names
    # a missing node writes nothing; the printed total is the number of rows
    # attempted, not the number of edges actually present.
    for orpha, hpo, freq in DISEASE_PHENOTYPES:
        await _q("""
            MATCH (d:Disease {orphaCode: $orpha})
            MATCH (p:Phenotype {hpoId: $hpo})
            MERGE (d)-[r:HAS_PHENOTYPE]->(p)
            SET r.frequency = $freq
        """, {"orpha": orpha, "hpo": hpo, "freq": freq})
    print(f" ✓ {len(DISEASE_PHENOTYPES)} Disease-Phenotype edges")

    # 5. Disease ↔ Gene (same MATCH-then-MERGE shape as step 4)
    for orpha, sym, pathogenicity in DISEASE_GENES:
        await _q("""
            MATCH (d:Disease {orphaCode: $orpha})
            MATCH (g:Gene {symbol: $sym})
            MERGE (d)-[r:ASSOCIATED_WITH]->(g)
            SET r.pathogenicity = $path
        """, {"orpha": orpha, "sym": sym, "path": pathogenicity})
    print(f" ✓ {len(DISEASE_GENES)} Disease-Gene edges")

    # 6. Synthetic PatientSpaces (so cohort works)
    for patient in SYNTHETIC_PATIENTS:
        space_id = patient["space_id"]
        await _q("""
            MERGE (s:PatientSpace {space_id: $space_id})
            SET s.state = $state, s.age = $age, s.is_synthetic = true
        """, {"space_id": space_id, "state": patient["state"], "age": patient["age"]})

        # Confirmed hypothesis pointing at the disease. The whole patient dict
        # is passed as parameters; the statements read $space_id and $orpha.
        await _q("""
            MATCH (s:PatientSpace {space_id: $space_id})
            MERGE (h:SpaceHypothesis {hypothesis_id: $space_id + '-h0'})
            SET h.status = 'confirmed', h.orpha_code = $orpha, h.probability = 0.95
            MERGE (s)-[:HAS_HYPOTHESIS]->(h)
        """, patient)
        await _q("""
            MATCH (h:SpaceHypothesis {hypothesis_id: $space_id + '-h0'})
            MATCH (d:Disease {orphaCode: $orpha})
            MERGE (h)-[:TARGETS]->(d)
        """, patient)

        # SpaceEvent → Phenotypes: one event per HPO id; the id is derived
        # from the space id and the panel position, so it is stable on re-run.
        for i, hpo in enumerate(patient["hpos"]):
            eid = f"{space_id}-e{i}"
            await _q("""
                MATCH (s:PatientSpace {space_id: $space_id})
                MERGE (e:SpaceEvent {event_id: $eid})
                SET e.event_type = 'symptom_onset', e.title = $hpo
                MERGE (s)-[:HAS_EVENT]->(e)
            """, {"space_id": space_id, "eid": eid, "hpo": hpo})
            await _q("""
                MATCH (e:SpaceEvent {event_id: $eid})
                MATCH (p:Phenotype {hpoId: $hpo})
                MERGE (e)-[:INVOLVES]->(p)
            """, {"eid": eid, "hpo": hpo})
    print(f" ✓ {len(SYNTHETIC_PATIENTS)} synthetic PatientSpaces seeded")

    # Verify: node counts grouped by each node's first label.
    counts = await _q("""
        MATCH (n) RETURN labels(n)[0] AS l, count(n) AS c
        ORDER BY c DESC LIMIT 20
    """)
    print("\nFinal Aura inventory:")
    # NOTE(review): _safe_query is opaque from here; presumably it can hand
    # back an empty or None result on failure, so guard the summary loop.
    for row in counts or []:
        print(f" {row['l']}: {row['c']}")
# Script entry point (e.g. ``python -m gemeo.seed_aura``): run the async
# seeding routine to completion via asyncio.run().
if __name__ == "__main__":
    asyncio.run(seed())