"""Seed an empty Aura instance with the minimal KG Gemeo needs. Without external Disease/Phenotype/Gene nodes, the cohort, subgraph, drug repurposing, family, reverse-pheno, and PCDT-compliance heads all return empty (graceful degrade). This script populates a curated rare- disease subset (~20 diseases, ~80 phenotypes, ~25 genes, plus a few synthetic confirmed-dx PatientSpaces for cohort retrieval) so the SOTA layer can demonstrate non-trivial behaviour. Run: python -m gemeo.seed_aura Idempotent: uses MERGE for every node. Safe to re-run. """ from __future__ import annotations import asyncio import logging import sys logger = logging.getLogger("gemeo.seed_aura") logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(name)s] %(message)s") # ─── Curated minimal KG (pediatric / Brazilian rare disease focus) ──────── DISEASES = [ # AT {"orpha": "100", "name": "Ataxia-telangiectasia", "name_pt": "Ataxia-telangiectasia", "cid10": "G11.3", "inheritance": "autosomal recessive", "prevalence": "1-9/1000000", "summary": "Progressive cerebellar ataxia, immunodeficiency, lymphoid malignancy risk."}, # NPC {"orpha": "646", "name": "Niemann-Pick disease type C", "name_pt": "Doença de Niemann-Pick tipo C", "cid10": "E75.2", "inheritance": "autosomal recessive", "prevalence": "1-9/100000", "summary": "Lysosomal lipid storage; vertical supranuclear gaze palsy, hepatosplenomegaly."}, # Gaucher {"orpha": "355", "name": "Gaucher disease", "name_pt": "Doença de Gaucher", "cid10": "E75.2", "inheritance": "autosomal recessive", "prevalence": "1-9/100000", "summary": "GBA deficiency; hepatosplenomegaly, bone pain, anemia."}, # Fabry {"orpha": "324", "name": "Fabry disease", "name_pt": "Doença de Fabry", "cid10": "E75.2", "inheritance": "X-linked recessive", "prevalence": "1-9/100000", "summary": "GLA deficiency; acroparesthesias, angiokeratomas, renal/cardiac involvement."}, # Pompe {"orpha": "365", "name": "Glycogen storage disease due to acid maltase deficiency", "name_pt": "Doença de Pompe", "cid10": "E74.0", "inheritance": "autosomal recessive", "prevalence": "1-9/100000", "summary": "GAA deficiency; hypotonia, cardiomyopathy in infants."}, # MPS I {"orpha": "579", "name": "Mucopolysaccharidosis type 1", "name_pt": "MPS I (Hurler-Scheie)", "cid10": "E76.0", "inheritance": "autosomal recessive", "prevalence": "1-9/100000", "summary": "IDUA deficiency; coarse facies, dysostosis multiplex."}, # SMA {"orpha": "70", "name": "Proximal spinal muscular atrophy type 1", "name_pt": "Atrofia Muscular Espinhal tipo 1", "cid10": "G12.0", "inheritance": "autosomal recessive", "prevalence": "1-9/100000", "summary": "SMN1 loss; severe infantile hypotonia, respiratory failure."}, # Wilson {"orpha": "905", "name": "Wilson disease", "name_pt": "Doença de Wilson", "cid10": "E83.0", "inheritance": "autosomal recessive", "prevalence": "1-9/100000", "summary": "ATP7B deficiency; hepatic, neurologic, Kayser-Fleischer rings."}, # DMD {"orpha": "98896", "name": "Duchenne muscular dystrophy", "name_pt": "Distrofia muscular de Duchenne", "cid10": "G71.0", "inheritance": "X-linked recessive", "prevalence": "1-9/100000", "summary": "DMD gene mutation; progressive muscle weakness, Gowers sign."}, # CF {"orpha": "586", "name": "Cystic fibrosis", "name_pt": "Fibrose cística", "cid10": "E84", "inheritance": "autosomal recessive", "prevalence": "1-9/100000", "summary": "CFTR; recurrent infections, pancreatic insufficiency."}, # PKU {"orpha": "716", "name": "Phenylketonuria", "name_pt": "Fenilcetonúria", "cid10": "E70.0", "inheritance": "autosomal recessive", "prevalence": "1-9/100000", "summary": "PAH deficiency; intellectual disability if untreated."}, # Marfan {"orpha": "558", "name": "Marfan syndrome", "name_pt": "Síndrome de Marfan", "cid10": "Q87.4", "inheritance": "autosomal dominant", "prevalence": "1-9/100000", "summary": "FBN1; aortic dilation, ectopia lentis, tall stature."}, # NF1 {"orpha": "636", "name": "Neurofibromatosis type 1", "name_pt": "Neurofibromatose tipo 1", "cid10": "Q85.0", "inheritance": "autosomal dominant", "prevalence": "1-9/10000", "summary": "NF1; café-au-lait, neurofibromas."}, # Rett {"orpha": "778", "name": "Rett syndrome", "name_pt": "Síndrome de Rett", "cid10": "F84.2", "inheritance": "X-linked dominant", "prevalence": "1-9/100000", "summary": "MECP2; regression, stereotypies, mostly females."}, # Friedreich ataxia {"orpha": "95", "name": "Friedreich ataxia", "name_pt": "Ataxia de Friedreich", "cid10": "G11.1", "inheritance": "autosomal recessive", "prevalence": "1-9/100000", "summary": "FXN GAA expansion; gait ataxia, cardiomyopathy, diabetes."}, # Pelizaeus-Merzbacher {"orpha": "702", "name": "Pelizaeus-Merzbacher disease", "name_pt": "Doença de Pelizaeus-Merzbacher", "cid10": "E75.2", "inheritance": "X-linked recessive", "prevalence": "1-9/1000000", "summary": "PLP1; nystagmus, hypotonia, leukodystrophy."}, # Hunter MPS II {"orpha": "580", "name": "Mucopolysaccharidosis type 2", "name_pt": "MPS II (Hunter)", "cid10": "E76.1", "inheritance": "X-linked recessive", "prevalence": "1-9/100000", "summary": "IDS deficiency; coarse facies, hepatosplenomegaly."}, # SCID {"orpha": "183660", "name": "Severe combined immunodeficiency", "name_pt": "Imunodeficiência combinada grave", "cid10": "D81.1", "inheritance": "various", "prevalence": "1-9/100000", "summary": "Multiple genes; severe T/B/NK deficiency, opportunistic infections."}, # CAH {"orpha": "90794", "name": "Congenital adrenal hyperplasia due to 21-hydroxylase deficiency", "name_pt": "Hiperplasia adrenal congênita", "cid10": "E25.0", "inheritance": "autosomal recessive", "prevalence": "1-9/100000", "summary": "CYP21A2; salt-wasting, virilization."}, # Hereditary hemolytic spherocytosis (added since H histórico familiar) {"orpha": "822", "name": "Hereditary spherocytosis", "name_pt": "Esferocitose hereditária", "cid10": "D58.0", "inheritance": "autosomal dominant", "prevalence": "1-9/10000", "summary": "Membrane defect; hemolysis, splenomegaly, jaundice."}, ] # HPO terms (curated, with PT-BR labels for the most clinically common) PHENOTYPES = [ ("HP:0001250", "Seizure", "Convulsão"), ("HP:0001251", "Ataxia", "Ataxia"), ("HP:0001260", "Dysarthria", "Disartria"), ("HP:0001009", "Telangiectasia", "Telangiectasia"), ("HP:0033106", "Increased serum alpha-fetoprotein", "AFP sérica elevada"), ("HP:0002720", "Decreased circulating IgA level", "IgA baixa"), ("HP:0006532", "Recurrent pneumonia", "Pneumonia recorrente"), ("HP:0002121", "Cerebellar atrophy", "Atrofia cerebelar"), ("HP:0000639", "Nystagmus", "Nistagmo"), ("HP:0001270", "Motor delay", "Atraso motor"), ("HP:0001249", "Intellectual disability", "Deficiência intelectual"), ("HP:0002240", "Hepatomegaly", "Hepatomegalia"), ("HP:0001744", "Splenomegaly", "Esplenomegalia"), ("HP:0000505", "Visual impairment", "Comprometimento visual"), ("HP:0001392", "Abnormal liver", "Alteração hepática"), ("HP:0001627", "Abnormal heart morphology", "Alteração cardíaca"), ("HP:0001638", "Cardiomyopathy", "Cardiomiopatia"), ("HP:0002715", "Abnormality of the immune system", "Imunodeficiência"), ("HP:0007354", "Amyotrophic lateral sclerosis-like", "Síndrome ELA-like"), ("HP:0002493", "Upper motor neuron dysfunction", "Disfunção do neurônio motor superior"), ("HP:0001257", "Spasticity", "Espasticidade"), ("HP:0001284", "Areflexia", "Arreflexia"), ("HP:0008936", "Muscular hypotonia of the trunk", "Hipotonia troncular"), ("HP:0003236", "Elevated circulating creatine kinase", "CK elevada"), ("HP:0001626", "Abnormal vasculature", "Alteração vascular"), ("HP:0001392", "Liver dysfunction", "Disfunção hepática"), ("HP:0002910", "Elevated hepatic transaminase", "Transaminases elevadas"), ("HP:0007360", "Aplasia/Hypoplasia of the cerebellum", "Hipoplasia cerebelar"), ("HP:0001263", "Global developmental delay", "Atraso global do desenvolvimento"), ("HP:0000252", "Microcephaly", "Microcefalia"), ("HP:0001876", "Pancytopenia", "Pancitopenia"), ("HP:0001873", "Thrombocytopenia", "Plaquetopenia"), ("HP:0001508", "Failure to thrive", "Baixo ganho ponderal"), ("HP:0002013", "Vomiting", "Vômitos"), ("HP:0002643", "Neonatal respiratory distress", "Insuficiência respiratória neonatal"), ("HP:0011675", "Arrhythmia", "Arritmia"), ("HP:0001417", "X-linked inheritance", "Herança ligada ao X"), ("HP:0001425", "Heterogeneous", "Heterogêneo"), ("HP:0010972", "Anemia of inadequate production", "Anemia hipoproliferativa"), ("HP:0001873", "Hemolytic anemia", "Anemia hemolítica"), ("HP:0001081", "Cholelithiasis", "Colelitíase"), ("HP:0006530", "Interstitial pulmonary disease", "Pneumopatia intersticial"), ("HP:0010301", "Spinal cord compression", "Compressão medular"), ("HP:0040315", "B-cell lymphoma", "Linfoma de células B"), ("HP:0010785", "T-cell lymphoma", "Linfoma de células T"), ] # Genes GENES = [ ("ATM", "11q22.3"), ("NPC1", "18q11.2"), ("NPC2", "14q24.3"), ("GBA", "1q22"), ("GLA", "Xq22.1"), ("GAA", "17q25.3"), ("IDUA", "4p16.3"), ("IDS", "Xq28"), ("SMN1", "5q13.2"), ("SMN2", "5q13.2"), ("ATP7B", "13q14.3"), ("DMD", "Xp21.2"), ("CFTR", "7q31.2"), ("PAH", "12q23.2"), ("FBN1", "15q21.1"), ("NF1", "17q11.2"), ("MECP2", "Xq28"), ("FXN", "9q21.11"), ("PLP1", "Xq22.2"), ("CYP21A2", "6p21.33"), ("ANK1", "8p11.21"), ("SPTB", "14q23.3"), ("IL2RG", "Xq13.1"), ("ADA", "20q13.12"), ("RAG1", "11p12"), ("RAG2", "11p12"), ] # Disease ↔ Phenotype edges (with frequency) DISEASE_PHENOTYPES = [ # AT (ORPHA:100) ("100", "HP:0001251", 0.95), # ataxia ("100", "HP:0001009", 0.85), # telangiectasia ("100", "HP:0033106", 0.95), # AFP elevated ("100", "HP:0002720", 0.80), # IgA low ("100", "HP:0006532", 0.85), # recurrent pneumonia ("100", "HP:0002121", 0.90), # cerebellar atrophy ("100", "HP:0001260", 0.80), # dysarthria ("100", "HP:0002715", 0.85), # immunodeficiency ("100", "HP:0040315", 0.25), # B-cell lymphoma (lifetime) ("100", "HP:0010785", 0.20), # T-cell lymphoma ("100", "HP:0006530", 0.50), # interstitial pulm disease (older) # NPC (ORPHA:646) ("646", "HP:0001251", 0.85), ("646", "HP:0002240", 0.70), # hepatomegaly ("646", "HP:0001744", 0.80), # splenomegaly ("646", "HP:0001249", 0.75), # ID ("646", "HP:0001260", 0.60), ("646", "HP:0000505", 0.70), # vertical gaze # Gaucher (ORPHA:355) ("355", "HP:0002240", 0.85), ("355", "HP:0001744", 0.95), ("355", "HP:0001873", 0.70), # thrombocytopenia # Fabry (ORPHA:324) ("324", "HP:0001626", 0.80), ("324", "HP:0001638", 0.70), # Pompe (ORPHA:365) ("365", "HP:0001638", 0.95), ("365", "HP:0008936", 0.95), ("365", "HP:0003236", 0.90), # MPS I (ORPHA:579) ("579", "HP:0002240", 0.85), ("579", "HP:0001744", 0.70), # SMA-1 (ORPHA:70) ("70", "HP:0008936", 0.99), ("70", "HP:0001284", 0.90), ("70", "HP:0001270", 0.99), # Wilson (ORPHA:905) ("905", "HP:0002910", 0.85), ("905", "HP:0001392", 0.90), # DMD (ORPHA:98896) ("98896", "HP:0001270", 0.95), ("98896", "HP:0003236", 0.95), ("98896", "HP:0001638", 0.60), # CF (ORPHA:586) ("586", "HP:0006532", 0.95), ("586", "HP:0001508", 0.75), # PKU (ORPHA:716) ("716", "HP:0001249", 0.85), # Friedreich (ORPHA:95) ("95", "HP:0001251", 0.95), ("95", "HP:0001638", 0.70), # Spherocytosis (ORPHA:822) ("822", "HP:0001873", 0.95), # hemolytic anemia ("822", "HP:0001744", 0.80), ("822", "HP:0001081", 0.50), ] # Disease ↔ Gene DISEASE_GENES = [ ("100", "ATM", "pathogenic"), ("646", "NPC1", "pathogenic"), ("646", "NPC2", "pathogenic"), ("355", "GBA", "pathogenic"), ("324", "GLA", "pathogenic"), ("365", "GAA", "pathogenic"), ("579", "IDUA", "pathogenic"), ("580", "IDS", "pathogenic"), ("70", "SMN1", "pathogenic"), ("905", "ATP7B", "pathogenic"), ("98896", "DMD", "pathogenic"), ("586", "CFTR", "pathogenic"), ("716", "PAH", "pathogenic"), ("558", "FBN1", "pathogenic"), ("636", "NF1", "pathogenic"), ("778", "MECP2", "pathogenic"), ("95", "FXN", "pathogenic"), ("702", "PLP1", "pathogenic"), ("90794", "CYP21A2", "pathogenic"), ("822", "ANK1", "pathogenic"), ("822", "SPTB", "pathogenic"), ("183660", "IL2RG", "pathogenic"), ("183660", "ADA", "pathogenic"), ("183660", "RAG1", "pathogenic"), ] # Synthetic confirmed-dx PatientSpaces (so cohort retrieval surfaces real exemplars) # Each is a confirmed AT/NPC/Gaucher/etc patient with a small HPO panel. SYNTHETIC_PATIENTS = [ {"space_id": "synth-at-001", "orpha": "100", "state": "SP", "hpos": ["HP:0001251", "HP:0001009", "HP:0033106", "HP:0006532"], "age": 8}, {"space_id": "synth-at-002", "orpha": "100", "state": "RJ", "hpos": ["HP:0001251", "HP:0033106", "HP:0002715"], "age": 10}, {"space_id": "synth-at-003", "orpha": "100", "state": "MG", "hpos": ["HP:0001251", "HP:0001009", "HP:0002121", "HP:0006532"], "age": 7}, {"space_id": "synth-at-004", "orpha": "100", "state": "BA", "hpos": ["HP:0001251", "HP:0033106", "HP:0040315"], "age": 14}, {"space_id": "synth-at-005", "orpha": "100", "state": "SP", "hpos": ["HP:0001251", "HP:0001009", "HP:0006530"], "age": 12}, {"space_id": "synth-npc-001", "orpha": "646", "state": "SP", "hpos": ["HP:0001251", "HP:0001744", "HP:0002240"], "age": 9}, {"space_id": "synth-npc-002", "orpha": "646", "state": "RS", "hpos": ["HP:0001251", "HP:0001249", "HP:0000505"], "age": 11}, {"space_id": "synth-gaucher-001", "orpha": "355", "state": "SP", "hpos": ["HP:0002240", "HP:0001744", "HP:0001873"], "age": 6}, {"space_id": "synth-fabry-001", "orpha": "324", "state": "PR", "hpos": ["HP:0001626", "HP:0001638"], "age": 16}, {"space_id": "synth-pompe-001", "orpha": "365", "state": "SP", "hpos": ["HP:0001638", "HP:0008936"], "age": 1}, {"space_id": "synth-sma1-001", "orpha": "70", "state": "RJ", "hpos": ["HP:0008936", "HP:0001284", "HP:0002643"], "age": 1}, {"space_id": "synth-dmd-001", "orpha": "98896", "state": "MG", "hpos": ["HP:0001270", "HP:0003236"], "age": 5}, ] async def _q(cypher, params=None): from space_graph import _safe_query return await _safe_query(cypher, params or {}) async def seed(): print("Seeding Aura with curated rare-disease KG...") # 1. Diseases for d in DISEASES: await _q(""" MERGE (x:Disease {orphaCode: $orpha}) SET x.name = $name, x.cid10DescriptionPt = $name_pt, x.cid10 = $cid10, x.inheritance = $inheritance, x.prevalence = $prevalence, x.summary = $summary """, d) print(f" ✓ {len(DISEASES)} Disease nodes") # 2. Phenotypes (HPO) for hpo, name, name_pt in PHENOTYPES: await _q(""" MERGE (p:Phenotype {hpoId: $hpo}) SET p.name = $name, p.name_pt = $name_pt """, {"hpo": hpo, "name": name, "name_pt": name_pt}) print(f" ✓ {len(PHENOTYPES)} Phenotype nodes") # 3. Genes for sym, loc in GENES: await _q(""" MERGE (g:Gene {symbol: $sym}) SET g.location = $loc """, {"sym": sym, "loc": loc}) print(f" ✓ {len(GENES)} Gene nodes") # 4. Disease ↔ Phenotype for orpha, hpo, freq in DISEASE_PHENOTYPES: await _q(""" MATCH (d:Disease {orphaCode: $orpha}) MATCH (p:Phenotype {hpoId: $hpo}) MERGE (d)-[r:HAS_PHENOTYPE]->(p) SET r.frequency = $freq """, {"orpha": orpha, "hpo": hpo, "freq": freq}) print(f" ✓ {len(DISEASE_PHENOTYPES)} Disease-Phenotype edges") # 5. Disease ↔ Gene for orpha, sym, path in DISEASE_GENES: await _q(""" MATCH (d:Disease {orphaCode: $orpha}) MATCH (g:Gene {symbol: $sym}) MERGE (d)-[r:ASSOCIATED_WITH]->(g) SET r.pathogenicity = $path """, {"orpha": orpha, "sym": sym, "path": path}) print(f" ✓ {len(DISEASE_GENES)} Disease-Gene edges") # 6. Synthetic PatientSpaces (so cohort works) for p in SYNTHETIC_PATIENTS: await _q(""" MERGE (s:PatientSpace {space_id: $space_id}) SET s.state = $state, s.age = $age, s.is_synthetic = true """, {"space_id": p["space_id"], "state": p["state"], "age": p["age"]}) # Confirmed hypothesis pointing at the disease await _q(""" MATCH (s:PatientSpace {space_id: $space_id}) MERGE (h:SpaceHypothesis {hypothesis_id: $space_id + '-h0'}) SET h.status = 'confirmed', h.orpha_code = $orpha, h.probability = 0.95 MERGE (s)-[:HAS_HYPOTHESIS]->(h) """, p) await _q(""" MATCH (h:SpaceHypothesis {hypothesis_id: $space_id + '-h0'}) MATCH (d:Disease {orphaCode: $orpha}) MERGE (h)-[:TARGETS]->(d) """, p) # SpaceEvent → Phenotypes for i, hpo in enumerate(p["hpos"]): eid = f"{p['space_id']}-e{i}" await _q(""" MATCH (s:PatientSpace {space_id: $space_id}) MERGE (e:SpaceEvent {event_id: $eid}) SET e.event_type = 'symptom_onset', e.title = $hpo MERGE (s)-[:HAS_EVENT]->(e) """, {"space_id": p["space_id"], "eid": eid, "hpo": hpo}) await _q(""" MATCH (e:SpaceEvent {event_id: $eid}) MATCH (p:Phenotype {hpoId: $hpo}) MERGE (e)-[:INVOLVES]->(p) """, {"eid": eid, "hpo": hpo}) print(f" ✓ {len(SYNTHETIC_PATIENTS)} synthetic PatientSpaces seeded") # Verify counts = await _q(""" MATCH (n) RETURN labels(n)[0] AS l, count(n) AS c ORDER BY c DESC LIMIT 20 """) print("\nFinal Aura inventory:") for r in counts: print(f" {r['l']}: {r['c']}") if __name__ == "__main__": asyncio.run(seed())