"""Seed an empty Aura instance with the minimal KG Gemeo needs.

Without external Disease/Phenotype/Gene nodes, the cohort, subgraph,
drug repurposing, family, reverse-pheno, and PCDT-compliance heads all
return empty (graceful degrade). This script populates a curated rare-
disease subset (~20 diseases, ~45 phenotypes, ~25 genes, plus a few
synthetic confirmed-dx PatientSpaces for cohort retrieval) so the SOTA
layer can demonstrate non-trivial behaviour.

Run: python -m gemeo.seed_aura

Idempotent: every node and relationship is written with MERGE, so the
script is safe to re-run.
"""
| from __future__ import annotations |
| import asyncio |
| import logging |
| import sys |
|
|
# Install the root handler/format first, then grab this module's logger.
# The two calls are independent, so their order does not matter.
logging.basicConfig(
    format="%(asctime)s [%(name)s] %(message)s",
    level=logging.INFO,
)
logger = logging.getLogger("gemeo.seed_aura")
|
|
|
|
| |
|
|
# Curated Disease nodes. Each dict is passed as-is as the parameter map of the
# Disease MERGE query in seed(), so the keys must match the $orpha / $name /
# $name_pt / $cid10 / $inheritance / $prevalence / $summary placeholders.
# Portuguese labels are stored as real UTF-8 text (they were mis-decoded
# before, e.g. "Doença" had been garbled).
DISEASES = [
    # Ataxia-telangiectasia
    {"orpha": "100", "name": "Ataxia-telangiectasia", "name_pt": "Ataxia-telangiectasia",
     "cid10": "G11.3", "inheritance": "autosomal recessive",
     "prevalence": "1-9/1000000", "summary": "Progressive cerebellar ataxia, immunodeficiency, lymphoid malignancy risk."},
    # Niemann-Pick type C
    {"orpha": "646", "name": "Niemann-Pick disease type C", "name_pt": "Doença de Niemann-Pick tipo C",
     "cid10": "E75.2", "inheritance": "autosomal recessive",
     "prevalence": "1-9/100000", "summary": "Lysosomal lipid storage; vertical supranuclear gaze palsy, hepatosplenomegaly."},
    # Gaucher
    {"orpha": "355", "name": "Gaucher disease", "name_pt": "Doença de Gaucher",
     "cid10": "E75.2", "inheritance": "autosomal recessive",
     "prevalence": "1-9/100000", "summary": "GBA deficiency; hepatosplenomegaly, bone pain, anemia."},
    # Fabry
    {"orpha": "324", "name": "Fabry disease", "name_pt": "Doença de Fabry",
     "cid10": "E75.2", "inheritance": "X-linked recessive",
     "prevalence": "1-9/100000", "summary": "GLA deficiency; acroparesthesias, angiokeratomas, renal/cardiac involvement."},
    # Pompe
    {"orpha": "365", "name": "Glycogen storage disease due to acid maltase deficiency", "name_pt": "Doença de Pompe",
     "cid10": "E74.0", "inheritance": "autosomal recessive",
     "prevalence": "1-9/100000", "summary": "GAA deficiency; hypotonia, cardiomyopathy in infants."},
    # MPS I
    {"orpha": "579", "name": "Mucopolysaccharidosis type 1", "name_pt": "MPS I (Hurler-Scheie)",
     "cid10": "E76.0", "inheritance": "autosomal recessive",
     "prevalence": "1-9/100000", "summary": "IDUA deficiency; coarse facies, dysostosis multiplex."},
    # SMA type 1
    {"orpha": "70", "name": "Proximal spinal muscular atrophy type 1", "name_pt": "Atrofia Muscular Espinhal tipo 1",
     "cid10": "G12.0", "inheritance": "autosomal recessive",
     "prevalence": "1-9/100000", "summary": "SMN1 loss; severe infantile hypotonia, respiratory failure."},
    # Wilson
    {"orpha": "905", "name": "Wilson disease", "name_pt": "Doença de Wilson",
     "cid10": "E83.0", "inheritance": "autosomal recessive",
     "prevalence": "1-9/100000", "summary": "ATP7B deficiency; hepatic, neurologic, Kayser-Fleischer rings."},
    # Duchenne
    {"orpha": "98896", "name": "Duchenne muscular dystrophy", "name_pt": "Distrofia muscular de Duchenne",
     "cid10": "G71.0", "inheritance": "X-linked recessive",
     "prevalence": "1-9/100000", "summary": "DMD gene mutation; progressive muscle weakness, Gowers sign."},
    # Cystic fibrosis
    {"orpha": "586", "name": "Cystic fibrosis", "name_pt": "Fibrose cística",
     "cid10": "E84", "inheritance": "autosomal recessive",
     "prevalence": "1-9/100000", "summary": "CFTR; recurrent infections, pancreatic insufficiency."},
    # Phenylketonuria
    {"orpha": "716", "name": "Phenylketonuria", "name_pt": "Fenilcetonúria",
     "cid10": "E70.0", "inheritance": "autosomal recessive",
     "prevalence": "1-9/100000", "summary": "PAH deficiency; intellectual disability if untreated."},
    # Marfan
    {"orpha": "558", "name": "Marfan syndrome", "name_pt": "Síndrome de Marfan",
     "cid10": "Q87.4", "inheritance": "autosomal dominant",
     "prevalence": "1-9/100000", "summary": "FBN1; aortic dilation, ectopia lentis, tall stature."},
    # Neurofibromatosis type 1
    {"orpha": "636", "name": "Neurofibromatosis type 1", "name_pt": "Neurofibromatose tipo 1",
     "cid10": "Q85.0", "inheritance": "autosomal dominant",
     "prevalence": "1-9/10000", "summary": "NF1; café-au-lait, neurofibromas."},
    # Rett
    {"orpha": "778", "name": "Rett syndrome", "name_pt": "Síndrome de Rett",
     "cid10": "F84.2", "inheritance": "X-linked dominant",
     "prevalence": "1-9/100000", "summary": "MECP2; regression, stereotypies, mostly females."},
    # Friedreich ataxia
    {"orpha": "95", "name": "Friedreich ataxia", "name_pt": "Ataxia de Friedreich",
     "cid10": "G11.1", "inheritance": "autosomal recessive",
     "prevalence": "1-9/100000", "summary": "FXN GAA expansion; gait ataxia, cardiomyopathy, diabetes."},
    # Pelizaeus-Merzbacher
    {"orpha": "702", "name": "Pelizaeus-Merzbacher disease", "name_pt": "Doença de Pelizaeus-Merzbacher",
     "cid10": "E75.2", "inheritance": "X-linked recessive",
     "prevalence": "1-9/1000000", "summary": "PLP1; nystagmus, hypotonia, leukodystrophy."},
    # MPS II
    {"orpha": "580", "name": "Mucopolysaccharidosis type 2", "name_pt": "MPS II (Hunter)",
     "cid10": "E76.1", "inheritance": "X-linked recessive",
     "prevalence": "1-9/100000", "summary": "IDS deficiency; coarse facies, hepatosplenomegaly."},
    # SCID
    {"orpha": "183660", "name": "Severe combined immunodeficiency", "name_pt": "Imunodeficiência combinada grave",
     "cid10": "D81.1", "inheritance": "various",
     "prevalence": "1-9/100000", "summary": "Multiple genes; severe T/B/NK deficiency, opportunistic infections."},
    # Congenital adrenal hyperplasia (21-hydroxylase deficiency)
    {"orpha": "90794", "name": "Congenital adrenal hyperplasia due to 21-hydroxylase deficiency", "name_pt": "Hiperplasia adrenal congênita",
     "cid10": "E25.0", "inheritance": "autosomal recessive",
     "prevalence": "1-9/100000", "summary": "CYP21A2; salt-wasting, virilization."},
    # Hereditary spherocytosis
    {"orpha": "822", "name": "Hereditary spherocytosis", "name_pt": "Esferocitose hereditária",
     "cid10": "D58.0", "inheritance": "autosomal dominant",
     "prevalence": "1-9/10000", "summary": "Membrane defect; hemolysis, splenomegaly, jaundice."},
]
|
|
| |
# Phenotype nodes as (HPO id, English label, Portuguese label).
# seed() MERGEs on the HPO id and then SETs the labels, so every id must be
# unique here: a repeated id silently overwrites the earlier labels.
#   * "Hemolytic anemia" previously reused HP:0001873 (Thrombocytopenia) and
#     clobbered its name; it now has its own id, HP:0001878.
#   * A second HP:0001392 row ("Liver dysfunction") duplicated
#     "Abnormal liver" and was dropped.
# NOTE(review): MERGE never deletes, so a database seeded with the old tables
# keeps its old spherocytosis -> HP:0001873 edge until removed by hand.
PHENOTYPES = [
    ("HP:0001250", "Seizure", "Convulsão"),
    ("HP:0001251", "Ataxia", "Ataxia"),
    ("HP:0001260", "Dysarthria", "Disartria"),
    ("HP:0001009", "Telangiectasia", "Telangiectasia"),
    ("HP:0033106", "Increased serum alpha-fetoprotein", "AFP sérica elevada"),
    ("HP:0002720", "Decreased circulating IgA level", "IgA baixa"),
    ("HP:0006532", "Recurrent pneumonia", "Pneumonia recorrente"),
    ("HP:0002121", "Cerebellar atrophy", "Atrofia cerebelar"),
    ("HP:0000639", "Nystagmus", "Nistagmo"),
    ("HP:0001270", "Motor delay", "Atraso motor"),
    ("HP:0001249", "Intellectual disability", "Deficiência intelectual"),
    ("HP:0002240", "Hepatomegaly", "Hepatomegalia"),
    ("HP:0001744", "Splenomegaly", "Esplenomegalia"),
    ("HP:0000505", "Visual impairment", "Comprometimento visual"),
    ("HP:0001392", "Abnormal liver", "Alteração hepática"),
    ("HP:0001627", "Abnormal heart morphology", "Alteração cardíaca"),
    ("HP:0001638", "Cardiomyopathy", "Cardiomiopatia"),
    ("HP:0002715", "Abnormality of the immune system", "Imunodeficiência"),
    ("HP:0007354", "Amyotrophic lateral sclerosis-like", "Síndrome ELA-like"),
    ("HP:0002493", "Upper motor neuron dysfunction", "Disfunção do neurônio motor superior"),
    ("HP:0001257", "Spasticity", "Espasticidade"),
    ("HP:0001284", "Areflexia", "Arreflexia"),
    ("HP:0008936", "Muscular hypotonia of the trunk", "Hipotonia troncular"),
    ("HP:0003236", "Elevated circulating creatine kinase", "CK elevada"),
    ("HP:0001626", "Abnormal vasculature", "Alteração vascular"),
    ("HP:0002910", "Elevated hepatic transaminase", "Transaminases elevadas"),
    ("HP:0007360", "Aplasia/Hypoplasia of the cerebellum", "Hipoplasia cerebelar"),
    ("HP:0001263", "Global developmental delay", "Atraso global do desenvolvimento"),
    ("HP:0000252", "Microcephaly", "Microcefalia"),
    ("HP:0001876", "Pancytopenia", "Pancitopenia"),
    ("HP:0001873", "Thrombocytopenia", "Plaquetopenia"),
    ("HP:0001508", "Failure to thrive", "Baixo ganho ponderal"),
    ("HP:0002013", "Vomiting", "Vômitos"),
    ("HP:0002643", "Neonatal respiratory distress", "Insuficiência respiratória neonatal"),
    ("HP:0011675", "Arrhythmia", "Arritmia"),
    ("HP:0001417", "X-linked inheritance", "Herança ligada ao X"),
    ("HP:0001425", "Heterogeneous", "Heterogêneo"),
    ("HP:0010972", "Anemia of inadequate production", "Anemia hipoproliferativa"),
    ("HP:0001878", "Hemolytic anemia", "Anemia hemolítica"),
    ("HP:0001081", "Cholelithiasis", "Colelitíase"),
    ("HP:0006530", "Interstitial pulmonary disease", "Pneumopatia intersticial"),
    ("HP:0010301", "Spinal cord compression", "Compressão medular"),
    ("HP:0040315", "B-cell lymphoma", "Linfoma de células B"),
    ("HP:0010785", "T-cell lymphoma", "Linfoma de células T"),
]


# Gene nodes as (symbol, cytogenetic location); seed() MERGEs on the symbol.
GENES = [
    ("ATM", "11q22.3"),
    ("NPC1", "18q11.2"),
    ("NPC2", "14q24.3"),
    ("GBA", "1q22"),
    ("GLA", "Xq22.1"),
    ("GAA", "17q25.3"),
    ("IDUA", "4p16.3"),
    ("IDS", "Xq28"),
    ("SMN1", "5q13.2"),
    ("SMN2", "5q13.2"),
    ("ATP7B", "13q14.3"),
    ("DMD", "Xp21.2"),
    ("CFTR", "7q31.2"),
    ("PAH", "12q23.2"),
    ("FBN1", "15q21.1"),
    ("NF1", "17q11.2"),
    ("MECP2", "Xq28"),
    ("FXN", "9q21.11"),
    ("PLP1", "Xq22.2"),
    ("CYP21A2", "6p21.33"),
    ("ANK1", "8p11.21"),
    ("SPTB", "14q23.3"),
    ("IL2RG", "Xq13.1"),
    ("ADA", "20q13.12"),
    ("RAG1", "11p12"),
    ("RAG2", "11p12"),
]


# HAS_PHENOTYPE edges as (orpha code, HPO id, frequency).
# Both endpoints are MATCHed in seed(), so a row whose HPO id is missing from
# PHENOTYPES (or whose orpha code is missing from DISEASES) creates no edge.
DISEASE_PHENOTYPES = [
    # Ataxia-telangiectasia
    ("100", "HP:0001251", 0.95),
    ("100", "HP:0001009", 0.85),
    ("100", "HP:0033106", 0.95),
    ("100", "HP:0002720", 0.80),
    ("100", "HP:0006532", 0.85),
    ("100", "HP:0002121", 0.90),
    ("100", "HP:0001260", 0.80),
    ("100", "HP:0002715", 0.85),
    ("100", "HP:0040315", 0.25),
    ("100", "HP:0010785", 0.20),
    ("100", "HP:0006530", 0.50),
    # Niemann-Pick type C
    ("646", "HP:0001251", 0.85),
    ("646", "HP:0002240", 0.70),
    ("646", "HP:0001744", 0.80),
    ("646", "HP:0001249", 0.75),
    ("646", "HP:0001260", 0.60),
    ("646", "HP:0000505", 0.70),
    # Gaucher
    ("355", "HP:0002240", 0.85),
    ("355", "HP:0001744", 0.95),
    ("355", "HP:0001873", 0.70),
    # Fabry
    ("324", "HP:0001626", 0.80),
    ("324", "HP:0001638", 0.70),
    # Pompe
    ("365", "HP:0001638", 0.95),
    ("365", "HP:0008936", 0.95),
    ("365", "HP:0003236", 0.90),
    # MPS I
    ("579", "HP:0002240", 0.85),
    ("579", "HP:0001744", 0.70),
    # SMA type 1
    ("70", "HP:0008936", 0.99),
    ("70", "HP:0001284", 0.90),
    ("70", "HP:0001270", 0.99),
    # Wilson
    ("905", "HP:0002910", 0.85),
    ("905", "HP:0001392", 0.90),
    # Duchenne
    ("98896", "HP:0001270", 0.95),
    ("98896", "HP:0003236", 0.95),
    ("98896", "HP:0001638", 0.60),
    # Cystic fibrosis
    ("586", "HP:0006532", 0.95),
    ("586", "HP:0001508", 0.75),
    # Phenylketonuria
    ("716", "HP:0001249", 0.85),
    # Friedreich ataxia
    ("95", "HP:0001251", 0.95),
    ("95", "HP:0001638", 0.70),
    # Hereditary spherocytosis (hemolytic anemia, not thrombocytopenia)
    ("822", "HP:0001878", 0.95),
    ("822", "HP:0001744", 0.80),
    ("822", "HP:0001081", 0.50),
]
|
|
| |
# ASSOCIATED_WITH edges as (orpha code, gene symbol, pathogenicity).
# Every curated association carries the same "pathogenic" label, so the rows
# are expanded from a disease -> gene symbols table; row order is unchanged.
# NOTE(review): GENES also lists SMN2 and RAG2, which have no row here —
# confirm whether that omission is intentional.
DISEASE_GENES = [
    (orpha_code, gene_symbol, "pathogenic")
    for orpha_code, gene_symbols in (
        ("100", ("ATM",)),
        ("646", ("NPC1", "NPC2")),
        ("355", ("GBA",)),
        ("324", ("GLA",)),
        ("365", ("GAA",)),
        ("579", ("IDUA",)),
        ("580", ("IDS",)),
        ("70", ("SMN1",)),
        ("905", ("ATP7B",)),
        ("98896", ("DMD",)),
        ("586", ("CFTR",)),
        ("716", ("PAH",)),
        ("558", ("FBN1",)),
        ("636", ("NF1",)),
        ("778", ("MECP2",)),
        ("95", ("FXN",)),
        ("702", ("PLP1",)),
        ("90794", ("CYP21A2",)),
        ("822", ("ANK1", "SPTB")),
        ("183660", ("IL2RG", "ADA", "RAG1")),
    )
    for gene_symbol in gene_symbols
]
|
|
| |
| |
# Synthetic PatientSpaces used to give cohort retrieval something to find.
# Each dict carries the space id, the orpha code of the patient's confirmed
# hypothesis, a two-letter state code, the HPO ids recorded as symptom events,
# and an age. Rows are written compactly as
# (space_id, orpha, state, age, hpos) and expanded into dicts below.
SYNTHETIC_PATIENTS = [
    {"space_id": sid, "orpha": orpha_code, "state": uf, "hpos": list(hpo_ids), "age": age}
    for sid, orpha_code, uf, age, hpo_ids in (
        ("synth-at-001", "100", "SP", 8, ("HP:0001251", "HP:0001009", "HP:0033106", "HP:0006532")),
        ("synth-at-002", "100", "RJ", 10, ("HP:0001251", "HP:0033106", "HP:0002715")),
        ("synth-at-003", "100", "MG", 7, ("HP:0001251", "HP:0001009", "HP:0002121", "HP:0006532")),
        ("synth-at-004", "100", "BA", 14, ("HP:0001251", "HP:0033106", "HP:0040315")),
        ("synth-at-005", "100", "SP", 12, ("HP:0001251", "HP:0001009", "HP:0006530")),
        ("synth-npc-001", "646", "SP", 9, ("HP:0001251", "HP:0001744", "HP:0002240")),
        ("synth-npc-002", "646", "RS", 11, ("HP:0001251", "HP:0001249", "HP:0000505")),
        ("synth-gaucher-001", "355", "SP", 6, ("HP:0002240", "HP:0001744", "HP:0001873")),
        ("synth-fabry-001", "324", "PR", 16, ("HP:0001626", "HP:0001638")),
        ("synth-pompe-001", "365", "SP", 1, ("HP:0001638", "HP:0008936")),
        ("synth-sma1-001", "70", "RJ", 1, ("HP:0008936", "HP:0001284", "HP:0002643")),
        ("synth-dmd-001", "98896", "MG", 5, ("HP:0001270", "HP:0003236")),
    )
]
|
|
|
|
async def _q(cypher, params=None):
    """Run one Cypher statement via ``space_graph._safe_query``.

    ``cypher`` is the query text and ``params`` its parameter map; a missing
    or empty map is sent as a fresh ``{}``. Whatever ``_safe_query`` returns
    is handed back unchanged.
    """
    # Imported on first use rather than at module load.
    # NOTE(review): presumably so the module can be imported without the
    # graph backend available — confirm.
    from space_graph import _safe_query

    bound = params if params else {}
    outcome = await _safe_query(cypher, bound)
    return outcome
|
|
|
|
async def seed():
    """Write the curated knowledge graph and synthetic cohort to the database.

    Steps, in order (each statement is sent through ``_q``):

    1. Disease, Phenotype and Gene nodes (MERGE on their natural key, then
       SET the descriptive properties).
    2. Disease-[:HAS_PHENOTYPE]->Phenotype and
       Disease-[:ASSOCIATED_WITH]->Gene edges. Endpoints are MATCHed, so a
       row pointing at a node that does not exist creates nothing.
    3. For every synthetic patient: a PatientSpace, one confirmed
       SpaceHypothesis targeting its Disease, and one ``symptom_onset``
       SpaceEvent per HPO id, linked to the matching Phenotype.
    4. A label/count inventory printed to stdout.

    All writes use MERGE, so re-running does not duplicate nodes or edges.
    The counts printed after each step are the sizes of the input tables,
    not the number of rows the database reports as written.
    """
    # NOTE(review): the status glyph in the progress messages was mis-encoded
    # in the source (it showed as "β"); restored here as a check mark — confirm.
    print("Seeding Aura with curated rare-disease KG...")

    # --- Disease nodes -----------------------------------------------------
    # Each DISEASES dict is passed directly as the parameter map.
    for d in DISEASES:
        await _q("""
            MERGE (x:Disease {orphaCode: $orpha})
            SET x.name = $name,
                x.cid10DescriptionPt = $name_pt,
                x.cid10 = $cid10,
                x.inheritance = $inheritance,
                x.prevalence = $prevalence,
                x.summary = $summary
        """, d)
    print(f" ✓ {len(DISEASES)} Disease nodes")

    # --- Phenotype nodes ---------------------------------------------------
    for hpo, name, name_pt in PHENOTYPES:
        await _q("""
            MERGE (p:Phenotype {hpoId: $hpo})
            SET p.name = $name, p.name_pt = $name_pt
        """, {"hpo": hpo, "name": name, "name_pt": name_pt})
    print(f" ✓ {len(PHENOTYPES)} Phenotype nodes")

    # --- Gene nodes --------------------------------------------------------
    for sym, loc in GENES:
        await _q("""
            MERGE (g:Gene {symbol: $sym})
            SET g.location = $loc
        """, {"sym": sym, "loc": loc})
    print(f" ✓ {len(GENES)} Gene nodes")

    # --- Disease -> Phenotype edges ----------------------------------------
    for orpha, hpo, freq in DISEASE_PHENOTYPES:
        await _q("""
            MATCH (d:Disease {orphaCode: $orpha})
            MATCH (p:Phenotype {hpoId: $hpo})
            MERGE (d)-[r:HAS_PHENOTYPE]->(p)
            SET r.frequency = $freq
        """, {"orpha": orpha, "hpo": hpo, "freq": freq})
    print(f" ✓ {len(DISEASE_PHENOTYPES)} Disease-Phenotype edges")

    # --- Disease -> Gene edges ---------------------------------------------
    for orpha, sym, path in DISEASE_GENES:
        await _q("""
            MATCH (d:Disease {orphaCode: $orpha})
            MATCH (g:Gene {symbol: $sym})
            MERGE (d)-[r:ASSOCIATED_WITH]->(g)
            SET r.pathogenicity = $path
        """, {"orpha": orpha, "sym": sym, "path": path})
    print(f" ✓ {len(DISEASE_GENES)} Disease-Gene edges")

    # --- Synthetic PatientSpaces -------------------------------------------
    for p in SYNTHETIC_PATIENTS:
        await _q("""
            MERGE (s:PatientSpace {space_id: $space_id})
            SET s.state = $state, s.age = $age, s.is_synthetic = true
        """, {"space_id": p["space_id"], "state": p["state"], "age": p["age"]})

        # One confirmed hypothesis per patient, with id "<space_id>-h0".
        # The whole patient dict is passed; the query only reads $space_id
        # and $orpha from it.
        await _q("""
            MATCH (s:PatientSpace {space_id: $space_id})
            MERGE (h:SpaceHypothesis {hypothesis_id: $space_id + '-h0'})
            SET h.status = 'confirmed', h.orpha_code = $orpha, h.probability = 0.95
            MERGE (s)-[:HAS_HYPOTHESIS]->(h)
        """, p)
        await _q("""
            MATCH (h:SpaceHypothesis {hypothesis_id: $space_id + '-h0'})
            MATCH (d:Disease {orphaCode: $orpha})
            MERGE (h)-[:TARGETS]->(d)
        """, p)

        # One symptom_onset event per HPO id, with id "<space_id>-e<i>",
        # linked to the Phenotype node when that node exists.
        for i, hpo in enumerate(p["hpos"]):
            eid = f"{p['space_id']}-e{i}"
            await _q("""
                MATCH (s:PatientSpace {space_id: $space_id})
                MERGE (e:SpaceEvent {event_id: $eid})
                SET e.event_type = 'symptom_onset', e.title = $hpo
                MERGE (s)-[:HAS_EVENT]->(e)
            """, {"space_id": p["space_id"], "eid": eid, "hpo": hpo})
            await _q("""
                MATCH (e:SpaceEvent {event_id: $eid})
                MATCH (p:Phenotype {hpoId: $hpo})
                MERGE (e)-[:INVOLVES]->(p)
            """, {"eid": eid, "hpo": hpo})
    print(f" ✓ {len(SYNTHETIC_PATIENTS)} synthetic PatientSpaces seeded")

    # --- Inventory ---------------------------------------------------------
    counts = await _q("""
        MATCH (n) RETURN labels(n)[0] AS l, count(n) AS c
        ORDER BY c DESC LIMIT 20
    """)
    print("\nFinal Aura inventory:")
    # NOTE(review): _safe_query's failure value is not visible here; treat a
    # falsy result (None / empty) as "nothing to list" instead of raising.
    for r in counts or []:
        print(f" {r['l']}: {r['c']}")
|
|
|
|
# Script entry point: drive the async seeding routine to completion on a
# fresh event loop when the module is executed directly.
if __name__ == "__main__":
    asyncio.run(seed())
|
|