CTA / backend /data_ingestion.py
TheQuantEd's picture
Initial deployment: ClinicalMatch AI v2.0 — FHIR R4 · MCP (9 tools) · A2A workflow · SHARP compliance · 100k synthetic patients · Neo4j graph · GraphRAG chatbot
59abb4f
from neo4j_setup import neo4j_conn
def ingest_sample_data(conn=None) -> None:
    """Ingest rich sample data into the Neo4j knowledge graph.

    First removes every node flagged ``sample = true`` (with its
    relationships), then MERGEs a fixed set of patients, diagnoses,
    biomarkers, trials and study sites, followed by the relationships
    between them. Every statement uses MERGE, so re-running the function
    does not duplicate nodes or relationships.

    Args:
        conn: Object exposing ``run_query(query)``. Defaults to the
            module-level ``neo4j_conn``.

    Raises:
        Exception: Whatever ``run_query`` raises for the initial clearing
            query. Failures of the individual ingestion statements are
            reported on stdout and skipped (best-effort ingestion).
    """
    if conn is None:
        conn = neo4j_conn

    # Clear existing sample data. Within this function only Patient nodes are
    # tagged `sample: true`, so diagnoses, biomarkers, trials and sites
    # survive this step and are simply re-MERGEd below.
    conn.run_query("MATCH (n) WHERE n.sample = true DETACH DELETE n")

    queries = [
        # Patients with rich profiles
        """
        MERGE (p1:Patient {id: 'P001'})
        SET p1 += {age: 45, gender: 'female', ethnicity: 'White', sample: true,
        zip_code: '02115', diagnosis_date: '2022-06-01'}
        """,
        """
        MERGE (p2:Patient {id: 'P002'})
        SET p2 += {age: 60, gender: 'male', ethnicity: 'Black/African American', sample: true,
        zip_code: '77030', diagnosis_date: '2021-11-15'}
        """,
        """
        MERGE (p3:Patient {id: 'P003'})
        SET p3 += {age: 38, gender: 'female', ethnicity: 'Hispanic/Latino', sample: true,
        zip_code: '94102', diagnosis_date: '2023-02-10'}
        """,
        """
        MERGE (p4:Patient {id: 'P004'})
        SET p4 += {age: 67, gender: 'male', ethnicity: 'Asian', sample: true,
        zip_code: '10001', diagnosis_date: '2022-09-20'}
        """,
        """
        MERGE (p5:Patient {id: 'P005'})
        SET p5 += {age: 34, gender: 'female', ethnicity: 'White', sample: true,
        zip_code: '60601', diagnosis_date: '2023-07-01'}
        """,
        # Diagnoses
        """MERGE (d1:Diagnosis {code: 'C50'}) SET d1.name = 'Breast Cancer', d1.snomed = '254837009'""",
        """MERGE (d2:Diagnosis {code: 'C61'}) SET d2.name = 'Prostate Cancer', d2.snomed = '399068003'""",
        """MERGE (d3:Diagnosis {code: 'C34'}) SET d3.name = 'Non-Small Cell Lung Cancer', d3.snomed = '363346000'""",
        """MERGE (d4:Diagnosis {code: 'C18'}) SET d4.name = 'Colorectal Cancer', d4.snomed = '93761005'""",
        # Biomarkers
        """MERGE (b1:Biomarker {id: 'HER2_POS'}) SET b1.name = 'HER2 Positive', b1.loinc = '85319-2'""",
        """MERGE (b2:Biomarker {id: 'EGFR_L858R'}) SET b2.name = 'EGFR L858R Mutation', b2.loinc = '81704-9'""",
        # NOTE(review): BRCA2_POS carries the same LOINC code as HER2_POS
        # above — looks like a copy/paste; confirm the intended code.
        """MERGE (b3:Biomarker {id: 'BRCA2_POS'}) SET b3.name = 'BRCA2 Mutation', b3.loinc = '85319-2'""",
        """MERGE (b4:Biomarker {id: 'MSI_H'}) SET b4.name = 'MSI-High', b4.loinc = '85077-6'""",
        """MERGE (b5:Biomarker {id: 'PDL1_HIGH'}) SET b5.name = 'PD-L1 High (>50%)', b5.loinc = '73977-1'""",
        # Trials (only the first one sets max_age; the others set min_age only)
        """
        MERGE (t1:Trial {id: 'NCT04889131'})
        SET t1 += {phase: 'PHASE2', condition: 'Breast Cancer', status: 'RECRUITING',
        title: 'Precision HER2+ Breast Cancer Study', min_age: 18, max_age: 75,
        enrollment_target: 150, enrolled: 87, sponsor: 'Dana-Farber'}
        """,
        """
        MERGE (t2:Trial {id: 'NCT05123456'})
        SET t2 += {phase: 'PHASE3', condition: 'Breast Cancer', status: 'RECRUITING',
        title: 'Immunotherapy Combination for Advanced Breast Cancer', min_age: 18,
        enrollment_target: 400, enrolled: 142, sponsor: 'Pharma Innovations Inc'}
        """,
        """
        MERGE (t3:Trial {id: 'NCT05456789'})
        SET t3 += {phase: 'PHASE2', condition: 'Prostate Cancer', status: 'RECRUITING',
        title: 'BRCA2 Prostate Cancer PARP Inhibitor Trial', min_age: 18,
        enrollment_target: 120, enrolled: 54, sponsor: 'Oncology Research Group'}
        """,
        """
        MERGE (t4:Trial {id: 'NCT06112233'})
        SET t4 += {phase: 'PHASE3', condition: 'Non-Small Cell Lung Cancer', status: 'RECRUITING',
        title: 'EGFR-Mutant NSCLC Targeted Therapy Study', min_age: 18,
        enrollment_target: 300, enrolled: 178, sponsor: 'Global Cancer Institute'}
        """,
        """
        MERGE (t5:Trial {id: 'NCT05334455'})
        SET t5 += {phase: 'PHASE2', condition: 'Colorectal Cancer', status: 'RECRUITING',
        title: 'MSI-H Colorectal Cancer Immunotherapy Study', min_age: 18,
        enrollment_target: 100, enrolled: 45, sponsor: 'NCI'}
        """,
        # Study Sites
        """
        MERGE (s1:StudySite {id: 'DFCI'})
        SET s1 += {name: 'Dana-Farber Cancer Institute', city: 'Boston', state: 'MA',
        lat: 42.3376, lon: -71.1083, active_trials: 4}
        """,
        """
        MERGE (s2:StudySite {id: 'MDACC'})
        SET s2 += {name: 'MD Anderson Cancer Center', city: 'Houston', state: 'TX',
        lat: 29.7066, lon: -95.3990, active_trials: 6}
        """,
        """
        MERGE (s3:StudySite {id: 'MSK'})
        SET s3 += {name: 'Memorial Sloan Kettering', city: 'New York', state: 'NY',
        lat: 40.7644, lon: -73.9581, active_trials: 5}
        """,
        # Patient-Diagnosis relationships
        """MATCH (p:Patient {id: 'P001'}), (d:Diagnosis {code: 'C50'}) MERGE (p)-[:HAS_DIAGNOSIS]->(d)""",
        """MATCH (p:Patient {id: 'P002'}), (d:Diagnosis {code: 'C61'}) MERGE (p)-[:HAS_DIAGNOSIS]->(d)""",
        """MATCH (p:Patient {id: 'P003'}), (d:Diagnosis {code: 'C50'}) MERGE (p)-[:HAS_DIAGNOSIS]->(d)""",
        """MATCH (p:Patient {id: 'P004'}), (d:Diagnosis {code: 'C34'}) MERGE (p)-[:HAS_DIAGNOSIS]->(d)""",
        """MATCH (p:Patient {id: 'P005'}), (d:Diagnosis {code: 'C18'}) MERGE (p)-[:HAS_DIAGNOSIS]->(d)""",
        # Patient-Biomarker relationships
        """MATCH (p:Patient {id: 'P001'}), (b:Biomarker {id: 'HER2_POS'}) MERGE (p)-[:HAS_BIOMARKER]->(b)""",
        """MATCH (p:Patient {id: 'P002'}), (b:Biomarker {id: 'BRCA2_POS'}) MERGE (p)-[:HAS_BIOMARKER]->(b)""",
        """MATCH (p:Patient {id: 'P004'}), (b:Biomarker {id: 'EGFR_L858R'}) MERGE (p)-[:HAS_BIOMARKER]->(b)""",
        """MATCH (p:Patient {id: 'P004'}), (b:Biomarker {id: 'PDL1_HIGH'}) MERGE (p)-[:HAS_BIOMARKER]->(b)""",
        """MATCH (p:Patient {id: 'P005'}), (b:Biomarker {id: 'MSI_H'}) MERGE (p)-[:HAS_BIOMARKER]->(b)""",
        # Diagnosis-Trial eligibility
        """MATCH (d:Diagnosis {code: 'C50'}), (t:Trial {id: 'NCT04889131'}) MERGE (d)-[:ELIGIBLE_FOR]->(t)""",
        """MATCH (d:Diagnosis {code: 'C50'}), (t:Trial {id: 'NCT05123456'}) MERGE (d)-[:ELIGIBLE_FOR]->(t)""",
        """MATCH (d:Diagnosis {code: 'C61'}), (t:Trial {id: 'NCT05456789'}) MERGE (d)-[:ELIGIBLE_FOR]->(t)""",
        """MATCH (d:Diagnosis {code: 'C34'}), (t:Trial {id: 'NCT06112233'}) MERGE (d)-[:ELIGIBLE_FOR]->(t)""",
        """MATCH (d:Diagnosis {code: 'C18'}), (t:Trial {id: 'NCT05334455'}) MERGE (d)-[:ELIGIBLE_FOR]->(t)""",
        # Trial-Site relationships
        """MATCH (t:Trial {id: 'NCT04889131'}), (s:StudySite {id: 'DFCI'}) MERGE (t)-[:CONDUCTED_AT]->(s)""",
        """MATCH (t:Trial {id: 'NCT04889131'}), (s:StudySite {id: 'MSK'}) MERGE (t)-[:CONDUCTED_AT]->(s)""",
        """MATCH (t:Trial {id: 'NCT05123456'}), (s:StudySite {id: 'MDACC'}) MERGE (t)-[:CONDUCTED_AT]->(s)""",
        """MATCH (t:Trial {id: 'NCT05123456'}), (s:StudySite {id: 'MSK'}) MERGE (t)-[:CONDUCTED_AT]->(s)""",
        """MATCH (t:Trial {id: 'NCT05456789'}), (s:StudySite {id: 'MDACC'}) MERGE (t)-[:CONDUCTED_AT]->(s)""",
        # Biomarker-Trial requirements
        """MATCH (b:Biomarker {id: 'HER2_POS'}), (t:Trial {id: 'NCT04889131'}) MERGE (b)-[:REQUIRED_FOR]->(t)""",
        """MATCH (b:Biomarker {id: 'EGFR_L858R'}), (t:Trial {id: 'NCT06112233'}) MERGE (b)-[:REQUIRED_FOR]->(t)""",
        """MATCH (b:Biomarker {id: 'MSI_H'}), (t:Trial {id: 'NCT05334455'}) MERGE (b)-[:REQUIRED_FOR]->(t)""",
    ]

    failed = 0
    for query in queries:
        try:
            conn.run_query(query)
        except Exception as e:
            # Deliberately best-effort: one bad statement must not abort the
            # remaining ones, but it is counted so the summary stays honest.
            failed += 1
            print(f"Ingestion warning: {e}")

    # Only claim success when every statement actually went through.
    if failed:
        print(
            f"Sample data ingestion finished with {failed} of "
            f"{len(queries)} statements failing."
        )
    else:
        print("Rich sample data ingested successfully.")
if __name__ == "__main__":
    # Script entry point: run the ingestion, then release the connection.
    # try/finally guarantees close() runs even if the ingestion raises
    # (e.g. the initial clearing query fails).
    try:
        ingest_sample_data()
    finally:
        neo4j_conn.close()