Spaces:
Running
Running
Initial deployment: ClinicalMatch AI v2.0 — FHIR R4 · MCP (9 tools) · A2A workflow · SHARP compliance · 100k synthetic patients · Neo4j graph · GraphRAG chatbot
"""
Graph seeder — fetches REAL data from live public APIs and populates Neo4j.

Data sources (all free, no auth):
- ClinicalTrials.gov v2 API (NCT trial records)
- RxNorm (NIH) (medication RxCUI codes)
- ICD-10 CM (NLM) (diagnosis codes)
- PubMed (NCBI) (supporting literature PMIDs)
- Synthetic patients (500 realistic profiles matched to real trials)

Run once to seed, or schedule periodically to stay current.
"""
| import httpx | |
| import asyncio | |
| import time | |
| import random | |
| from neo4j_setup import neo4j_conn | |
# Base URLs of the public APIs referenced by this module.
CTGOV_BASE = "https://clinicaltrials.gov/api/v2/studies"
RXNORM_BASE = "https://rxnav.nlm.nih.gov/REST"
ICD10_BASE = "https://clinicaltables.nlm.nih.gov/api/icd10cm/v3/search"
PUBMED_BASE = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils"
FDA_BASE = "https://api.fda.gov/drug"

# Conditions to seed — expand as needed.
# NOTE: SEED_CONDITIONS is bound a second time further down this module to a
# longer list; functions read the global at call time, so they see whichever
# binding executed last.
SEED_CONDITIONS = [
    "breast cancer",
    "prostate cancer",
    "non-small cell lung cancer",
    "colorectal cancer",
    "ovarian cancer",
    "melanoma",
    "leukemia",
    "lymphoma",
    "glioblastoma",
    "pancreatic cancer",
]

# Key oncology medications to pre-load (looked up by name in RxNorm).
SEED_MEDICATIONS = [
    "trastuzumab",
    "pembrolizumab",
    "nivolumab",
    "osimertinib",
    "olaparib",
    "enzalutamide",
    "bevacizumab",
    "rituximab",
    "imatinib",
    "dabrafenib",
    "vemurafenib",
    "atezolizumab",
    "durvalumab",
    "cetuximab",
    "erlotinib",
    "capecitabine",
]

# ICD-10 prefixes for oncology.
SEED_ICD10_PREFIXES = [
    "C50",
    "C61",
    "C34",
    "C18",
    "C56",
    "C43",
    "C91",
    "C85",
    "C71",
    "C25",
]
# ── Neo4j helpers ─────────────────────────────────────────────────────────────
def upsert(query: str, params: dict | None = None):
    """Run a single Cypher statement through ``neo4j_conn`` on a best-effort basis.

    Args:
        query: Cypher text to execute.
        params: Query parameters; a falsy value is replaced by an empty dict.

    Any exception raised by ``neo4j_conn.run_query`` is printed as a warning
    and suppressed, so one failed write does not abort the seeding run.
    """
    bindings = params if params else {}
    try:
        neo4j_conn.run_query(query, bindings)
    except Exception as exc:
        print(f" [neo4j] warn: {exc}")
def batch_upsert(queries: list[tuple[str, dict]]):
    """Execute each ``(query, params)`` pair, in order, via ``upsert``."""
    for statement in queries:
        cypher, bindings = statement
        upsert(cypher, bindings)
# ── ClinicalTrials.gov ────────────────────────────────────────────────────────
async def fetch_trials_for_condition(client: httpx.AsyncClient, condition: str, page_size: int = 50) -> list[dict]:
    """Request one page of RECRUITING studies for *condition* from ClinicalTrials.gov.

    Args:
        client: Async HTTP client used for the GET request.
        condition: Value sent as the ``query.cond`` parameter.
        page_size: Value sent as the ``pageSize`` parameter.

    Returns:
        The ``studies`` list from the JSON response (``[]`` if the key is
        absent). On any error the message is printed and ``[]`` is returned.
    """
    query = {
        "query.cond": condition,
        "filter.overallStatus": "RECRUITING",
        "pageSize": page_size,
        "format": "json",
    }
    try:
        response = await client.get(CTGOV_BASE, params=query, timeout=30)
        response.raise_for_status()
        payload = response.json()
        return payload.get("studies", [])
    except Exception as exc:
        print(f" [ctgov] error for '{condition}': {exc}")
        return []
| def _extract_trial(study: dict, condition: str) -> dict | None: | |
| try: | |
| proto = study["protocolSection"] | |
| ident = proto["identificationModule"] | |
| status = proto.get("statusModule", {}) | |
| design = proto.get("designModule", {}) | |
| eligibility = proto.get("eligibilityModule", {}) | |
| desc = proto.get("descriptionModule", {}) | |
| sponsor = proto.get("sponsorCollaboratorsModule", {}) | |
| contacts = proto.get("contactsLocationsModule", {}) | |
| outcomes = proto.get("outcomesModule", {}) | |
| phases = design.get("phases", ["N/A"]) | |
| locations = contacts.get("locations", []) | |
| return { | |
| "nct_id": ident["nctId"], | |
| "title": ident.get("briefTitle", "")[:200], | |
| "status": status.get("overallStatus", "UNKNOWN"), | |
| "phase": phases[0] if phases else "N/A", | |
| "condition": condition, | |
| "brief_summary": desc.get("briefSummary", "")[:1000], | |
| "eligibility_criteria": eligibility.get("eligibilityCriteria", "")[:2000], | |
| "min_age": eligibility.get("minimumAge", ""), | |
| "max_age": eligibility.get("maximumAge", ""), | |
| "sex": eligibility.get("sex", "ALL"), | |
| "enrollment": design.get("enrollmentInfo", {}).get("count", 0), | |
| "start_date": status.get("startDateStruct", {}).get("date", ""), | |
| "completion_date": status.get("completionDateStruct", {}).get("date", ""), | |
| "sponsor": sponsor.get("leadSponsor", {}).get("name", "")[:100], | |
| "primary_outcomes": [o.get("measure", "")[:100] for o in outcomes.get("primaryOutcomes", [])[:3]], | |
| "location_count": len(locations), | |
| "locations": [ | |
| { | |
| "facility": loc.get("facility", "")[:100], | |
| "city": loc.get("city", ""), | |
| "state": loc.get("state", ""), | |
| "country": loc.get("country", "US"), | |
| "lat": loc.get("geoPoint", {}).get("lat"), | |
| "lon": loc.get("geoPoint", {}).get("lon"), | |
| } | |
| for loc in locations[:10] | |
| ], | |
| } | |
| except Exception as e: | |
| return None | |
async def seed_trials(client: httpx.AsyncClient) -> int:
    """Fetch recruiting trials for every seed condition and upsert them into Neo4j.

    For each usable study this writes a ``Trial`` node, a
    ``(ConditionNode)-[:HAS_TRIAL]->(Trial)`` relationship, and one
    ``StudySite`` node (linked with ``CONDUCTED_AT``) per location that has
    coordinates.

    Args:
        client: Async HTTP client passed to ``fetch_trials_for_condition``.

    Returns:
        Number of studies processed. Because ``upsert`` suppresses database
        errors, this counts attempts rather than confirmed writes.
    """
    trial_query = """
        MERGE (t:Trial {id: $nct_id})
        SET t += {
            title: $title, status: $status, phase: $phase,
            condition: $condition, brief_summary: $brief_summary,
            eligibility_criteria: $eligibility_criteria,
            min_age: $min_age, max_age: $max_age, sex: $sex,
            enrollment: $enrollment, start_date: $start_date,
            completion_date: $completion_date, sponsor: $sponsor,
            location_count: $location_count, source: 'clinicaltrials.gov',
            updated_at: datetime()
        }
    """
    condition_query = """
        MERGE (c:ConditionNode {name: $condition})
        WITH c
        MATCH (t:Trial {id: $nct_id})
        MERGE (c)-[:HAS_TRIAL]->(t)
    """
    site_query = """
        MERGE (s:StudySite {facility: $facility, city: $city, state: $state})
        SET s += {country: $country, lat: $lat, lon: $lon, source: 'clinicaltrials.gov'}
        WITH s
        MATCH (t:Trial {id: $nct_id})
        MERGE (t)-[:CONDUCTED_AT]->(s)
    """

    print("\n[1/5] Seeding clinical trials from ClinicalTrials.gov...")
    total = 0
    for condition in SEED_CONDITIONS:
        studies = await fetch_trials_for_condition(client, condition)
        print(f" {condition}: {len(studies)} trials fetched")
        for study in studies:
            trial = _extract_trial(study, condition)
            if not trial:
                continue
            # Upsert trial node
            upsert(trial_query, trial)
            # Upsert Condition → Trial relationship
            upsert(condition_query, {"condition": condition, "nct_id": trial["nct_id"]})
            # Upsert study sites
            for loc in trial["locations"]:
                # Compare against None, not truthiness: 0.0 is a valid
                # latitude (equator) and longitude (prime meridian).
                if loc.get("lat") is not None and loc.get("lon") is not None:
                    upsert(site_query, {**loc, "nct_id": trial["nct_id"]})
            total += 1
        await asyncio.sleep(0.5)  # Rate limit courtesy
    print(f" Total trials seeded: {total}")
    return total
# ── RxNorm (NIH) — Medications ────────────────────────────────────────────────
async def fetch_rxcui(client: httpx.AsyncClient, drug_name: str) -> list[dict]:
    """Look up RxNorm concepts for *drug_name* via the ``drugs.json`` endpoint.

    Takes at most three concepts from each concept group of the response and
    returns the first five collected overall. Each entry carries ``rxcui``,
    ``name``, the group's ``tty`` and the original ``search_name``.
    On any error the message is printed and ``[]`` is returned.
    """
    try:
        response = await client.get(
            f"{RXNORM_BASE}/drugs.json",
            params={"name": drug_name},
            timeout=15,
        )
        response.raise_for_status()
        payload = response.json()
        concept_groups = payload.get("drugGroup", {}).get("conceptGroup", [])
        matches = [
            {
                "rxcui": props.get("rxcui", ""),
                "name": props.get("name", ""),
                "tty": group.get("tty", ""),
                "search_name": drug_name,
            }
            for group in concept_groups
            for props in group.get("conceptProperties", [])[:3]
        ]
        return matches[:5]  # Top 5
    except Exception as exc:
        print(f" [rxnorm] error for '{drug_name}': {exc}")
        return []
async def seed_medications(client: httpx.AsyncClient) -> int:
    """Upsert one ``Medication`` node per seed drug, keyed by its RxCUI.

    Only the first concept returned by ``fetch_rxcui`` is stored for each
    drug; drugs with no concepts are skipped.

    Returns:
        Number of medication upserts attempted.
    """
    medication_query = """
        MERGE (m:Medication {rxcui: $rxcui})
        SET m += {
            name: $name, tty: $tty, generic_name: $search_name,
            source: 'rxnorm', updated_at: datetime()
        }
    """
    print("\n[2/5] Seeding medications from RxNorm...")
    seeded = 0
    for drug_name in SEED_MEDICATIONS:
        concepts = await fetch_rxcui(client, drug_name)
        if concepts:
            # Primary concept only
            upsert(medication_query, concepts[0])
            seeded += 1
        print(f" {drug_name}: {len(concepts)} RxCUI concepts")
        await asyncio.sleep(0.2)
    print(f" Total medications seeded: {seeded}")
    return seeded
# ── ICD-10 CM (NLM) — Diagnoses ───────────────────────────────────────────────
async def fetch_icd10(client: httpx.AsyncClient, prefix: str) -> list[dict]:
    """Search the NLM ICD-10-CM table for *prefix* (up to 20 results).

    The response is read as a JSON array whose fourth element holds the
    result rows; each row's first two items become ``code`` and ``name``.
    A response with fewer than four elements yields ``[]``.
    On any error the message is printed and ``[]`` is returned.
    """
    search = {
        "sf": "code,name",
        "terms": prefix,
        "maxList": 20,
    }
    try:
        response = await client.get(ICD10_BASE, params=search, timeout=15)
        response.raise_for_status()
        payload = response.json()
        if payload and len(payload) >= 4:
            return [{"code": row[0], "name": row[1]} for row in payload[3]]
        return []
    except Exception as exc:
        print(f" [icd10] error for '{prefix}': {exc}")
        return []
async def seed_diagnoses(client: httpx.AsyncClient) -> int:
    """Upsert ICD-10-CM ``Diagnosis`` nodes and link them to condition nodes.

    For each seed prefix, every code returned by ``fetch_icd10`` is merged as
    a ``Diagnosis`` node. All diagnoses whose code starts with the prefix are
    then linked to the mapped ``ConditionNode`` with ``MAPS_TO_CONDITION``.
    The link query uses ``MATCH`` on the condition, so nothing is linked if
    that ``ConditionNode`` does not already exist.

    Returns:
        Number of diagnosis upserts attempted.
    """
    # ICD prefix → condition name used for matching. Built once here rather
    # than on every loop iteration.
    condition_map = {
        "C50": "breast cancer", "C61": "prostate cancer", "C34": "non-small cell lung cancer",
        "C18": "colorectal cancer", "C56": "ovarian cancer", "C43": "melanoma",
        "C91": "leukemia", "C85": "lymphoma", "C71": "glioblastoma", "C25": "pancreatic cancer",
    }
    diagnosis_query = """
        MERGE (d:Diagnosis {code: $code})
        SET d += {name: $name, source: 'icd10cm', updated_at: datetime()}
    """
    link_query = """
        MATCH (d:Diagnosis) WHERE d.code STARTS WITH $prefix
        MATCH (c:ConditionNode {name: $condition})
        MERGE (d)-[:MAPS_TO_CONDITION]->(c)
    """

    print("\n[3/5] Seeding diagnoses from ICD-10 CM...")
    total = 0
    for prefix in SEED_ICD10_PREFIXES:
        codes = await fetch_icd10(client, prefix)
        for item in codes:
            upsert(diagnosis_query, item)
            total += 1
        condition = condition_map.get(prefix)
        if condition is not None:
            upsert(link_query, {"prefix": prefix, "condition": condition})
        print(f" ICD-10 {prefix}: {len(codes)} codes")
        await asyncio.sleep(0.2)
    print(f" Total diagnoses seeded: {total}")
    return total
# ── PubMed (NCBI) — Supporting Literature ─────────────────────────────────────
async def fetch_pubmed_ids(client: httpx.AsyncClient, condition: str, count: int = 5) -> list[str]:
    """Search PubMed (``esearch.fcgi``) for treatment-trial articles on *condition*.

    Args:
        client: Async HTTP client used for the GET request.
        condition: Condition name interpolated into the search term.
        count: Value sent as ``retmax``.

    Returns:
        The ``idlist`` of the search result. On any error (including a
        response without ``esearchresult``/``idlist``) the message is printed
        and ``[]`` is returned.
    """
    search = {
        "db": "pubmed",
        "term": f"clinical trial {condition} treatment[Title/Abstract]",
        "retmax": count,
        "retmode": "json",
        "sort": "relevance",
    }
    try:
        response = await client.get(f"{PUBMED_BASE}/esearch.fcgi", params=search, timeout=15)
        response.raise_for_status()
        payload = response.json()
        return payload["esearchresult"]["idlist"]
    except Exception as exc:
        print(f" [pubmed] error for '{condition}': {exc}")
        return []
async def fetch_pubmed_summary(client: httpx.AsyncClient, pmid: str) -> dict | None:
    """Fetch title, journal, date and first authors for one PubMed ID.

    Args:
        client: Async HTTP client used for the GET request.
        pmid: PubMed identifier sent as ``id`` to ``esummary.fcgi``.

    Returns:
        Dict with ``pmid``, ``title`` (max 200 chars), ``source``,
        ``pub_date`` and ``authors`` (up to three names, comma-separated),
        or ``None`` if the response has no entry for *pmid* or the request
        fails. Failures are printed, matching the other fetch helpers,
        instead of being dropped silently.
    """
    try:
        resp = await client.get(f"{PUBMED_BASE}/esummary.fcgi", params={
            "db": "pubmed", "id": pmid, "retmode": "json",
        }, timeout=15)
        resp.raise_for_status()
        result = resp.json()["result"]
        if pmid not in result:
            return None
        r = result[pmid]
        return {
            "pmid": pmid,
            "title": r.get("title", "")[:200],
            "source": r.get("source", ""),
            "pub_date": r.get("pubdate", ""),
            "authors": ", ".join(a.get("name", "") for a in r.get("authors", [])[:3]),
        }
    except Exception as exc:
        # Best-effort: the caller skips publications without a summary.
        print(f" [pubmed] error for PMID '{pmid}': {exc}")
        return None
async def seed_literature(client: httpx.AsyncClient) -> int:
    """Upsert ``Publication`` nodes for the first five seed conditions.

    Each PMID with a retrievable summary is merged as a ``Publication`` and
    linked with ``SUPPORTS_RESEARCH_ON`` to the matching ``ConditionNode``.
    The link uses ``MATCH`` on the condition, so it needs that node to exist.

    Returns:
        Number of publication upserts attempted.
    """
    publication_query = """
        MERGE (p:Publication {pmid: $pmid})
        SET p += {
            title: $title, journal: $source, pub_date: $pub_date,
            authors: $authors, source: 'pubmed', updated_at: datetime()
        }
        WITH p
        MATCH (c:ConditionNode {name: $condition})
        MERGE (p)-[:SUPPORTS_RESEARCH_ON]->(c)
    """
    print("\n[4/5] Seeding supporting literature from PubMed...")
    seeded = 0
    # Top 5 conditions to keep fast
    for condition in SEED_CONDITIONS[:5]:
        pmids = await fetch_pubmed_ids(client, condition)
        for pmid in pmids:
            summary = await fetch_pubmed_summary(client, pmid)
            if summary:
                upsert(publication_query, dict(summary, condition=condition))
                seeded += 1
        # Reports the number of PMIDs found, not the number with a summary.
        print(f" {condition}: {len(pmids)} publications linked")
        await asyncio.sleep(0.3)
    print(f" Total publications seeded: {seeded}")
    return seeded
# ── Biomarkers (static — curated from COSMIC / NCIT) ──────────────────────────
# Expand seed conditions to 20 oncology types.
# This rebinds the SEED_CONDITIONS name defined near the top of the module;
# the first ten entries repeat that earlier list in the same order.
SEED_CONDITIONS = [
    "breast cancer",
    "prostate cancer",
    "non-small cell lung cancer",
    "colorectal cancer",
    "ovarian cancer",
    "melanoma",
    "leukemia",
    "lymphoma",
    "glioblastoma",
    "pancreatic cancer",
    "bladder cancer",
    "renal cell carcinoma",
    "thyroid cancer",
    "multiple myeloma",
    "endometrial cancer",
    "cervical cancer",
    "gastric cancer",
    "hepatocellular carcinoma",
    "head and neck cancer",
    "sarcoma",
]
# Static biomarker catalogue. Each entry becomes a Biomarker node keyed by
# ``id`` and is linked to the ConditionNode named in ``condition``.
# NOTE(review): several unrelated markers share one LOINC code (e.g. "21637-4"
# on BRCA2_MUT, TP53_MUT, PTEN_LOSS and KRAS_WT; "85319-2" on the HER2 entries
# and MGMT_METH) — verify the codes against LOINC before relying on them.
CURATED_BIOMARKERS = [
    # Breast cancer
    {"id": "HER2_POS", "name": "HER2 Positive", "gene": "ERBB2", "loinc": "85319-2", "condition": "breast cancer"},
    {"id": "HER2_NEG", "name": "HER2 Negative", "gene": "ERBB2", "loinc": "85319-2", "condition": "breast cancer"},
    {"id": "BRCA1_MUT", "name": "BRCA1 Pathogenic Variant", "gene": "BRCA1", "loinc": "21636-6", "condition": "breast cancer"},
    {"id": "BRCA2_MUT", "name": "BRCA2 Pathogenic Variant", "gene": "BRCA2", "loinc": "21637-4", "condition": "breast cancer"},
    {"id": "PIK3CA_MUT", "name": "PIK3CA Mutation", "gene": "PIK3CA", "loinc": "82457-4", "condition": "breast cancer"},
    {"id": "TP53_MUT", "name": "TP53 Mutation", "gene": "TP53", "loinc": "21637-4", "condition": "breast cancer"},
    {"id": "ER_POS", "name": "Estrogen Receptor Positive", "gene": "ESR1", "loinc": "85310-1", "condition": "breast cancer"},
    {"id": "PR_POS", "name": "Progesterone Receptor Positive", "gene": "PGR", "loinc": "85321-8", "condition": "breast cancer"},
    {"id": "TNBC", "name": "Triple Negative Breast Cancer", "gene": "ERBB2/ESR1/PGR", "loinc": "85319-2", "condition": "breast cancer"},
    # Lung
    {"id": "EGFR_L858R", "name": "EGFR L858R Mutation", "gene": "EGFR", "loinc": "81704-9", "condition": "non-small cell lung cancer"},
    {"id": "EGFR_DEL19", "name": "EGFR Exon 19 Deletion", "gene": "EGFR", "loinc": "81704-9", "condition": "non-small cell lung cancer"},
    {"id": "EGFR_T790M", "name": "EGFR T790M Resistance Mutation", "gene": "EGFR", "loinc": "81704-9", "condition": "non-small cell lung cancer"},
    {"id": "ALK_FUSION", "name": "ALK Gene Fusion", "gene": "ALK", "loinc": "81695-9", "condition": "non-small cell lung cancer"},
    {"id": "ROS1_FUSION", "name": "ROS1 Gene Fusion", "gene": "ROS1", "loinc": "81696-7", "condition": "non-small cell lung cancer"},
    {"id": "MET_EX14", "name": "MET Exon 14 Skipping", "gene": "MET", "loinc": "82139-8", "condition": "non-small cell lung cancer"},
    {"id": "KRAS_G12C", "name": "KRAS G12C Mutation", "gene": "KRAS", "loinc": "81434-5", "condition": "non-small cell lung cancer"},
    # "≥" restored here; the scraped source showed it mis-encoded as "β₯".
    {"id": "PDL1_HIGH", "name": "PD-L1 TPS ≥50%", "gene": "CD274", "loinc": "73977-1", "condition": "non-small cell lung cancer"},
    {"id": "PDL1_LOW", "name": "PD-L1 TPS 1-49%", "gene": "CD274", "loinc": "73977-1", "condition": "non-small cell lung cancer"},
    {"id": "PDL1_NEG", "name": "PD-L1 TPS <1%", "gene": "CD274", "loinc": "73977-1", "condition": "non-small cell lung cancer"},
    # Prostate
    {"id": "PSA_ELEVATED", "name": "PSA Elevated (>4 ng/mL)", "gene": "KLK3", "loinc": "2857-1", "condition": "prostate cancer"},
    {"id": "PTEN_LOSS", "name": "PTEN Loss", "gene": "PTEN", "loinc": "21637-4", "condition": "prostate cancer"},
    {"id": "AR_V7", "name": "Androgen Receptor Splice Variant 7", "gene": "AR", "loinc": "82145-5", "condition": "prostate cancer"},
    # Colorectal
    {"id": "MSI_H", "name": "Microsatellite Instability-High", "gene": "MLH1/MSH2", "loinc": "85077-6", "condition": "colorectal cancer"},
    {"id": "MSS", "name": "Microsatellite Stable", "gene": "MLH1/MSH2", "loinc": "85077-6", "condition": "colorectal cancer"},
    {"id": "KRAS_WT", "name": "KRAS Wild-Type", "gene": "KRAS", "loinc": "21637-4", "condition": "colorectal cancer"},
    {"id": "BRAF_V600E", "name": "BRAF V600E Mutation", "gene": "BRAF", "loinc": "81287-7", "condition": "colorectal cancer"},
    {"id": "NRAS_MUT", "name": "NRAS Mutation", "gene": "NRAS", "loinc": "82143-0", "condition": "colorectal cancer"},
    # Melanoma
    {"id": "BRAF_V600K", "name": "BRAF V600K Mutation", "gene": "BRAF", "loinc": "81287-7", "condition": "melanoma"},
    {"id": "TMB_HIGH", "name": "Tumor Mutational Burden High (≥10)", "gene": "TMB", "loinc": "94076-7", "condition": "melanoma"},
    {"id": "NRAS_MEL", "name": "NRAS Mutation (Melanoma)", "gene": "NRAS", "loinc": "82143-0", "condition": "melanoma"},
    # GBM
    {"id": "IDH1_R132H", "name": "IDH1 R132H Mutation", "gene": "IDH1", "loinc": "82140-6", "condition": "glioblastoma"},
    {"id": "IDH1_WT", "name": "IDH1 Wild-Type", "gene": "IDH1", "loinc": "82140-6", "condition": "glioblastoma"},
    {"id": "MGMT_METH", "name": "MGMT Promoter Methylation", "gene": "MGMT", "loinc": "85319-2", "condition": "glioblastoma"},
    {"id": "EGFR_AMP", "name": "EGFR Amplification", "gene": "EGFR", "loinc": "81704-9", "condition": "glioblastoma"},
    # Leukemia / Lymphoma
    {"id": "BCR_ABL1", "name": "BCR-ABL1 Fusion (Philadelphia Chr)", "gene": "BCR/ABL1", "loinc": "33899-6", "condition": "leukemia"},
    {"id": "FLT3_ITD", "name": "FLT3 Internal Tandem Duplication", "gene": "FLT3", "loinc": "82144-8", "condition": "leukemia"},
    {"id": "NPM1_MUT", "name": "NPM1 Mutation", "gene": "NPM1", "loinc": "82147-1", "condition": "leukemia"},
    {"id": "CD20_POS", "name": "CD20 Positive", "gene": "MS4A1", "loinc": "85080-0", "condition": "lymphoma"},
    {"id": "EZH2_MUT", "name": "EZH2 Mutation", "gene": "EZH2", "loinc": "82148-9", "condition": "lymphoma"},
    # New conditions
    {"id": "FGFR3_MUT", "name": "FGFR3 Mutation", "gene": "FGFR3", "loinc": "82150-5", "condition": "bladder cancer"},
    {"id": "VHL_LOSS", "name": "VHL Gene Loss", "gene": "VHL", "loinc": "82151-3", "condition": "renal cell carcinoma"},
    {"id": "MTOR_MUT", "name": "mTOR Pathway Mutation", "gene": "MTOR", "loinc": "82152-1", "condition": "renal cell carcinoma"},
    {"id": "BRAF_THYROID", "name": "BRAF V600E (Thyroid)", "gene": "BRAF", "loinc": "81287-7", "condition": "thyroid cancer"},
    {"id": "RET_FUSION", "name": "RET Gene Fusion", "gene": "RET", "loinc": "82153-9", "condition": "thyroid cancer"},
    {"id": "NTRK_FUSION", "name": "NTRK Gene Fusion", "gene": "NTRK1/2/3", "loinc": "82154-7", "condition": "thyroid cancer"},
    {"id": "WHSC1_MUT", "name": "MMSET/WHSC1 Mutation", "gene": "NSD2", "loinc": "82155-4", "condition": "multiple myeloma"},
    {"id": "CDKN2A_LOSS", "name": "CDKN2A Loss", "gene": "CDKN2A", "loinc": "82156-2", "condition": "multiple myeloma"},
    {"id": "POLE_MUT", "name": "POLE Mutation", "gene": "POLE", "loinc": "82157-0", "condition": "endometrial cancer"},
    {"id": "CTNNB1_MUT", "name": "CTNNB1 Mutation", "gene": "CTNNB1", "loinc": "82158-8", "condition": "endometrial cancer"},
    {"id": "HPV_POS", "name": "HPV Positive", "gene": "HPV", "loinc": "21440-3", "condition": "cervical cancer"},
    {"id": "ERBB2_GC", "name": "HER2 Amplification (Gastric)", "gene": "ERBB2", "loinc": "85319-2", "condition": "gastric cancer"},
    {"id": "HBV_POS", "name": "Hepatitis B Virus Positive", "gene": "HBV", "loinc": "16933-4", "condition": "hepatocellular carcinoma"},
    {"id": "TERT_MUT", "name": "TERT Promoter Mutation", "gene": "TERT", "loinc": "82159-6", "condition": "hepatocellular carcinoma"},
    {"id": "PIK3CA_HNC", "name": "PIK3CA Mutation (H&N)", "gene": "PIK3CA", "loinc": "82457-4", "condition": "head and neck cancer"},
    {"id": "HPV_HNSC", "name": "HPV-Positive HNSCC", "gene": "HPV", "loinc": "21440-3", "condition": "head and neck cancer"},
    {"id": "CDK4_AMP", "name": "CDK4 Amplification", "gene": "CDK4", "loinc": "82160-4", "condition": "sarcoma"},
    {"id": "MDM2_AMP", "name": "MDM2 Amplification", "gene": "MDM2", "loinc": "82161-2", "condition": "sarcoma"},
]
def seed_biomarkers() -> int:
    """Upsert every curated biomarker and link it to its condition node.

    Each entry of ``CURATED_BIOMARKERS`` is merged as a ``Biomarker`` node;
    its ``ConditionNode`` is merged too (created if missing) and connected
    with ``RELEVANT_TO``.

    Returns:
        The number of curated biomarkers, i.e. ``len(CURATED_BIOMARKERS)``.
    """
    biomarker_query = """
        MERGE (b:Biomarker {id: $id})
        SET b += {name: $name, gene: $gene, loinc: $loinc, source: 'curated', updated_at: datetime()}
        WITH b
        MERGE (c:ConditionNode {name: $condition})
        MERGE (b)-[:RELEVANT_TO]->(c)
    """
    print("\n[5/5] Seeding biomarkers (curated from COSMIC/NCIT)...")
    for marker in CURATED_BIOMARKERS:
        upsert(biomarker_query, marker)
    marker_count = len(CURATED_BIOMARKERS)
    print(f" {marker_count} biomarkers seeded and linked to conditions")
    return marker_count
# ── Eligibility relationships ─────────────────────────────────────────────────
def derive_eligibility_relationships():
    """Materialise shortcut edges from diagnoses and biomarkers to trials.

    Follows existing ``MAPS_TO_CONDITION`` / ``RELEVANT_TO`` and ``HAS_TRIAL``
    paths and merges ``ELIGIBLE_FOR`` (Diagnosis → Trial) and
    ``MAY_QUALIFY_FOR`` (Biomarker → Trial) relationships.
    """
    derivations = (
        "MATCH (d:Diagnosis)-[:MAPS_TO_CONDITION]->(c:ConditionNode)-[:HAS_TRIAL]->(t:Trial) MERGE (d)-[:ELIGIBLE_FOR]->(t)",
        "MATCH (b:Biomarker)-[:RELEVANT_TO]->(c:ConditionNode)-[:HAS_TRIAL]->(t:Trial) MERGE (b)-[:MAY_QUALIFY_FOR]->(t)",
    )
    print("\n[+] Deriving eligibility relationships...")
    for cypher in derivations:
        upsert(cypher)
    print(" Eligibility relationships derived.")
# ──────────────────────────────────────────────────────────────────────────────
# Synthetic Patient Engine — 100 k clinically-informed personas
# Distributions based on: SEER 2023, TCGA biomarker atlas, ASCO guidelines,
# US Census 2020 demographics, ACS Cancer Facts & Figures 2024.
# ──────────────────────────────────────────────────────────────────────────────
# ── Name pools (US Census racial/ethnic proportions) ──────────────────────────
| _NAMES_F_WHITE = ["Emma","Olivia","Ava","Isabella","Sophia","Charlotte","Amelia","Mia","Harper", | |
| "Evelyn","Abigail","Emily","Elizabeth","Avery","Ella","Madison","Scarlett", | |
| "Victoria","Grace","Chloe","Penelope","Riley","Lily","Eleanor","Hannah", | |
| "Lillian","Addison","Aubrey","Ellie","Stella","Natalie","Leah","Hazel", | |
| "Violet","Audrey","Claire","Lucy","Anna","Samantha","Katherine"] | |
| _NAMES_F_BLACK = ["Aaliyah","Amara","Destiny","Imani","Jasmine","Keisha","Layla","Maya","Naomi", | |
| "Nia","Raven","Serena","Tamara","Unique","Zora","Aisha","Brianna","Crystal", | |
| "Diamond","Essence","Faith","Genesis","Heaven","India","Jade","Kiara","Lashonda", | |
| "Monique","Nadia","Precious","Quiana","Regina","Shanice","Tiffany","Whitney"] | |
| _NAMES_F_HISPANIC = ["Sofia","Camila","Valentina","Isabella","Daniela","Fernanda","Gabriela","Lucia", | |
| "Maria","Ana","Carmen","Diana","Elena","Gloria","Iris","Jessica","Laura", | |
| "Linda","Margarita","Natalia","Paola","Rosa","Sandra","Teresa","Veronica", | |
| "Ximena","Yolanda","Adriana","Beatriz","Carolina","Esperanza","Francisca"] | |
| _NAMES_F_ASIAN = ["Aiko","Mei","Yuki","Sakura","Hana","Yuna","Ji-Young","Soo-Jin","Lan","Linh", | |
| "Nguyen","Phuong","Priya","Divya","Ananya","Kavya","Shreya","Sanjana", | |
| "Hui","Xin","Ying","Fang","Jing","Li","Min","Qian","Wei","Xue","Yan","Zhen"] | |
| _NAMES_M_WHITE = ["Liam","Noah","William","James","Oliver","Benjamin","Elijah","Lucas","Mason", | |
| "Logan","Alexander","Ethan","Jacob","Michael","Daniel","Henry","Jackson", | |
| "Sebastian","Aiden","Matthew","Samuel","David","Joseph","Carter","Owen", | |
| "Wyatt","John","Jack","Luke","Dylan","Grayson","Levi","Isaac","Gabriel"] | |
| _NAMES_M_BLACK = ["Andre","DeShawn","Darius","Elijah","Isaiah","Jamal","Jaylen","Jordan","Kendrick", | |
| "Malik","Marcus","Marquise","Nathaniel","Omari","Quincy","Rashad","Roderick", | |
| "Terrence","Trevon","Xavier","Zion","Aaron","Calvin","Damon","Ernest","Frederick", | |
| "Gerald","Harold","Ivan","Jerome","Kenneth","Leonard","Maurice","Nelson"] | |
| _NAMES_M_HISPANIC = ["Santiago","Mateo","Alejandro","Sebastian","Diego","Carlos","Miguel","Andres", | |
| "Fernando","Jose","Luis","Manuel","Marco","Mario","Pablo","Rafael","Ricardo", | |
| "Roberto","Rodrigo","Victor","Alberto","Arturo","Cesar","Eduardo","Ernesto", | |
| "Francisco","Guillermo","Hector","Ignacio","Javier","Juan","Lorenzo","Oscar"] | |
| _NAMES_M_ASIAN = ["Wei","Ming","Jian","Yang","Hao","Lei","Tao","Xiao","Yong","Jun","Ryu","Kenji", | |
| "Hiroshi","Takashi","Yuto","Min-Jun","Seo-Jun","Ji-Ho","Arjun","Rahul","Vikram", | |
| "Suresh","Rajesh","Anil","Vijay","Amit","Nikhil","Rohan","Kiran","Sanjay"] | |
| _LAST_NAMES_WHITE = ["Smith","Johnson","Williams","Brown","Jones","Miller","Davis","Wilson","Anderson", | |
| "Thomas","Taylor","Moore","Jackson","Martin","Lee","Thompson","White","Harris", | |
| "Clark","Lewis","Robinson","Walker","Young","Allen","King","Wright","Scott", | |
| "Green","Adams","Nelson","Baker","Hall","Campbell","Mitchell","Carter","Roberts"] | |
| _LAST_NAMES_BLACK = ["Williams","Johnson","Jones","Brown","Davis","Wilson","Thomas","Taylor","Moore", | |
| "Jackson","Harris","Thompson","White","Robinson","Walker","King","Green","Adams", | |
| "Baker","Hall","Carter","Mitchell","Peele","Banks","Bell","Boyd","Brooks","Bryant", | |
| "Byrd","Chambers","Coleman","Collins","Cooper","Crawford","Dixon","Edwards"] | |
| _LAST_NAMES_HISPANIC = ["Garcia","Rodriguez","Martinez","Hernandez","Lopez","Gonzalez","Perez","Sanchez", | |
| "Ramirez","Torres","Flores","Rivera","Gomez","Diaz","Reyes","Morales","Cruz", | |
| "Gutierrez","Ortiz","Chavez","Ramos","Romero","Vargas","Castillo","Jimenez", | |
| "Moreno","Alvarez","Mendoza","Ruiz","Aguilar","Vega","Castro","Medina"] | |
| _LAST_NAMES_ASIAN = ["Wang","Li","Zhang","Liu","Chen","Yang","Huang","Zhao","Wu","Zhou","Kim","Park", | |
| "Lee","Choi","Jung","Nguyen","Tran","Le","Pham","Hoang","Patel","Shah","Kumar", | |
| "Singh","Sharma","Gupta","Mehta","Kapoor","Nair","Reddy","Iyer","Rao","Joshi"] | |
# Ethnic distribution approximating US cancer patient demographics (ACS 2024).
# Each row: (label, weight, female first names, male first names, surnames).
# Groups without a dedicated pool reuse the White or Asian name lists.
_ETHNICITY_GROUPS = [
    ("White", 0.60, _NAMES_F_WHITE, _NAMES_M_WHITE, _LAST_NAMES_WHITE),
    ("Black or African American", 0.13, _NAMES_F_BLACK, _NAMES_M_BLACK, _LAST_NAMES_BLACK),
    ("Hispanic or Latino", 0.14, _NAMES_F_HISPANIC, _NAMES_M_HISPANIC, _LAST_NAMES_HISPANIC),
    ("Asian", 0.07, _NAMES_F_ASIAN, _NAMES_M_ASIAN, _LAST_NAMES_ASIAN),
    ("American Indian or Alaska Native", 0.03, _NAMES_F_WHITE, _NAMES_M_WHITE, _LAST_NAMES_WHITE),
    ("Native Hawaiian or Pacific Islander", 0.01, _NAMES_F_ASIAN, _NAMES_M_ASIAN, _LAST_NAMES_ASIAN),
    ("Other / Multiracial", 0.02, _NAMES_F_WHITE, _NAMES_M_WHITE, _LAST_NAMES_WHITE),
]
# Parallel lists derived from the table above: name pools without the weight,
# and the weights on their own.
_ETH_NAMES = [
    (label, female, male, surnames)
    for label, _weight, female, male, surnames in _ETHNICITY_GROUPS
]
_ETH_WEIGHTS = [weight for _label, weight, *_pools in _ETHNICITY_GROUPS]
| # City pool weighted by US metropolitan population (2020 Census) | |
| _CITIES = [ | |
| ("New York","NY",0.060),("Los Angeles","CA",0.045),("Chicago","IL",0.033), | |
| ("Houston","TX",0.027),("Phoenix","AZ",0.020),("Philadelphia","PA",0.018), | |
| ("San Antonio","TX",0.016),("San Diego","CA",0.016),("Dallas","TX",0.015), | |
| ("San Jose","CA",0.013),("Austin","TX",0.013),("Jacksonville","FL",0.011), | |
| ("Fort Worth","TX",0.010),("Columbus","OH",0.010),("Charlotte","NC",0.010), | |
| ("Indianapolis","IN",0.009),("San Francisco","CA",0.009),("Seattle","WA",0.009), | |
| ("Denver","CO",0.009),("Nashville","TN",0.009),("Boston","MA",0.009), | |
| ("Baltimore","MD",0.008),("Louisville","KY",0.007),("Portland","OR",0.007), | |
| ("Las Vegas","NV",0.007),("Milwaukee","WI",0.006),("Albuquerque","NM",0.006), | |
| ("Tucson","AZ",0.006),("Fresno","CA",0.005),("Sacramento","CA",0.005), | |
| ("Atlanta","GA",0.009),("Kansas City","MO",0.005),("Omaha","NE",0.004), | |
| ("Raleigh","NC",0.005),("Cleveland","OH",0.005),("Minneapolis","MN",0.006), | |
| ("Miami","FL",0.008),("Tampa","FL",0.007),("New Orleans","LA",0.005), | |
| ("Pittsburgh","PA",0.006),("Memphis","TN",0.005),("Richmond","VA",0.004), | |
| ("Birmingham","AL",0.004),("Salt Lake City","UT",0.004),("Hartford","CT",0.004), | |
| ("Buffalo","NY",0.004),("Rochester","NY",0.003),("Providence","RI",0.003), | |
| ("Des Moines","IA",0.003),("Little Rock","AR",0.003),("Madison","WI",0.003), | |
| ] | |
| _CITY_NAMES = [(c[0], c[1]) for c in _CITIES] | |
| _CITY_WEIGHTS = [c[2] for c in _CITIES] | |
| # Comorbidity prevalence in US oncology patients (literature-based) | |
| _COMORBIDITY_POOL = [ | |
| ("Type 2 Diabetes", 0.18), | |
| ("Hypertension", 0.42), | |
| ("Coronary Artery Disease",0.09), | |
| ("COPD", 0.08), | |
| ("Chronic Kidney Disease", 0.12), | |
| ("Obesity (BMI>30)", 0.36), | |
| ("Depression/Anxiety", 0.22), | |
| ("Hypothyroidism", 0.07), | |
| ("Atrial Fibrillation", 0.05), | |
| ("Osteoporosis", 0.06), | |
| ] | |
| # Insurance status (US cancer patient distribution, KFF 2023) | |
| _INSURANCE = [ | |
| ("Private/Employer", 0.48), | |
| ("Medicare", 0.30), | |
| ("Medicaid", 0.14), | |
| ("Uninsured", 0.05), | |
| ("VA/Military", 0.03), | |
| ] | |
| _INS_LABELS = [i[0] for i in _INSURANCE] | |
| _INS_WEIGHTS = [i[1] for i in _INSURANCE] | |
| # ECOG score distribution varies by condition severity | |
| _ECOG_BY_CONDITION: dict[str, list[float]] = { | |
| # [P(0), P(1), P(2), P(3)] | |
| "breast cancer": [0.35, 0.40, 0.18, 0.07], | |
| "prostate cancer": [0.30, 0.40, 0.20, 0.10], | |
| "non-small cell lung cancer": [0.20, 0.38, 0.28, 0.14], | |
| "colorectal cancer": [0.28, 0.40, 0.22, 0.10], | |
| "ovarian cancer": [0.25, 0.40, 0.25, 0.10], | |
| "melanoma": [0.40, 0.38, 0.15, 0.07], | |
| "leukemia": [0.25, 0.38, 0.25, 0.12], | |
| "lymphoma": [0.28, 0.40, 0.22, 0.10], | |
| "glioblastoma": [0.15, 0.35, 0.30, 0.20], | |
| "pancreatic cancer": [0.15, 0.32, 0.33, 0.20], | |
| "bladder cancer": [0.28, 0.40, 0.22, 0.10], | |
| "renal cell carcinoma": [0.32, 0.40, 0.20, 0.08], | |
| "thyroid cancer": [0.50, 0.35, 0.12, 0.03], | |
| "multiple myeloma": [0.22, 0.38, 0.28, 0.12], | |
| "endometrial cancer": [0.30, 0.40, 0.22, 0.08], | |
| "cervical cancer": [0.25, 0.40, 0.25, 0.10], | |
| "gastric cancer": [0.18, 0.35, 0.30, 0.17], | |
| "hepatocellular carcinoma": [0.15, 0.32, 0.33, 0.20], | |
| "head and neck cancer": [0.20, 0.38, 0.28, 0.14], | |
| "sarcoma": [0.30, 0.40, 0.22, 0.08], | |
| } | |
# ── Condition profiles (SEER-weighted) ────────────────────────────────────────
# count_weight — how many of the 100 k total patients come from this condition
# biomarker_prevalences — {biomarker_id: probability} (TCGA / literature)
| _CONDITION_PROFILES: dict[str, dict] = { | |
| "breast cancer": { | |
| "icd10_prefix": "C50", "sex": "FEMALE", "count_weight": 0.155, | |
| "age_range": (25, 82), "age_mode": 62, | |
| "stages": ["I","II","III","IV"], "stage_weights": [0.28, 0.32, 0.25, 0.15], | |
| "biomarker_prevalences": { | |
| "ER_POS":0.75,"PR_POS":0.65,"HER2_POS":0.17,"HER2_NEG":0.83, | |
| "TNBC":0.12,"BRCA1_MUT":0.05,"BRCA2_MUT":0.04, | |
| "PIK3CA_MUT":0.35,"TP53_MUT":0.28, | |
| }, | |
| "med_pool": ["trastuzumab","bevacizumab","capecitabine","olaparib","pembrolizumab"], | |
| "prior_chemo_rate": 0.65, | |
| }, | |
| "non-small cell lung cancer": { | |
| "icd10_prefix": "C34", "sex": "ALL", "count_weight": 0.130, | |
| "age_range": (40, 84), "age_mode": 68, | |
| "stages": ["I","II","III","IV"], "stage_weights": [0.09, 0.12, 0.28, 0.51], | |
| "biomarker_prevalences": { | |
| "EGFR_L858R":0.08,"EGFR_DEL19":0.09,"EGFR_T790M":0.05, | |
| "ALK_FUSION":0.04,"ROS1_FUSION":0.02,"MET_EX14":0.03, | |
| "KRAS_G12C":0.13,"PDL1_HIGH":0.28,"PDL1_LOW":0.30,"PDL1_NEG":0.42, | |
| }, | |
| "med_pool": ["osimertinib","pembrolizumab","nivolumab","erlotinib","atezolizumab","durvalumab"], | |
| "prior_chemo_rate": 0.55, | |
| }, | |
| "prostate cancer": { | |
| "icd10_prefix": "C61", "sex": "MALE", "count_weight": 0.095, | |
| "age_range": (45, 86), "age_mode": 67, | |
| "stages": ["I","II","III","IV"], "stage_weights": [0.18, 0.28, 0.28, 0.26], | |
| "biomarker_prevalences": { | |
| "PSA_ELEVATED":0.90,"BRCA2_MUT":0.05,"PTEN_LOSS":0.25,"AR_V7":0.20, | |
| }, | |
| "med_pool": ["enzalutamide","bevacizumab","olaparib","pembrolizumab"], | |
| "prior_chemo_rate": 0.40, | |
| }, | |
| "colorectal cancer": { | |
| "icd10_prefix": "C18", "sex": "ALL", "count_weight": 0.085, | |
| "age_range": (35, 82), "age_mode": 65, | |
| "stages": ["I","II","III","IV"], "stage_weights": [0.18, 0.26, 0.30, 0.26], | |
| "biomarker_prevalences": { | |
| "MSI_H":0.10,"MSS":0.90,"KRAS_WT":0.42, | |
| "BRAF_V600E":0.08,"NRAS_MUT":0.05,"KRAS_G12C":0.04, | |
| }, | |
| "med_pool": ["bevacizumab","cetuximab","capecitabine","pembrolizumab"], | |
| "prior_chemo_rate": 0.60, | |
| }, | |
| "melanoma": { | |
| "icd10_prefix": "C43", "sex": "ALL", "count_weight": 0.055, | |
| "age_range": (20, 80), "age_mode": 57, | |
| "stages": ["I","II","III","IV"], "stage_weights": [0.30, 0.28, 0.22, 0.20], | |
| "biomarker_prevalences": { | |
| "BRAF_V600E":0.45,"BRAF_V600K":0.06,"TMB_HIGH":0.35,"NRAS_MEL":0.20, | |
| }, | |
| "med_pool": ["pembrolizumab","nivolumab","dabrafenib","vemurafenib","ipilimumab"], | |
| "prior_chemo_rate": 0.30, | |
| }, | |
| "bladder cancer": { | |
| "icd10_prefix": "C67", "sex": "ALL", "count_weight": 0.045, | |
| "age_range": (45, 85), "age_mode": 69, | |
| "stages": ["I","II","III","IV"], "stage_weights": [0.28, 0.24, 0.26, 0.22], | |
| "biomarker_prevalences": { | |
| "FGFR3_MUT":0.20,"PDL1_HIGH":0.22,"TMB_HIGH":0.15,"TP53_MUT":0.30, | |
| }, | |
| "med_pool": ["pembrolizumab","atezolizumab","nivolumab","erdafitinib"], | |
| "prior_chemo_rate": 0.45, | |
| }, | |
| "renal cell carcinoma": { | |
| "icd10_prefix": "C64", "sex": "ALL", "count_weight": 0.042, | |
| "age_range": (40, 82), "age_mode": 64, | |
| "stages": ["I","II","III","IV"], "stage_weights": [0.25, 0.20, 0.25, 0.30], | |
| "biomarker_prevalences": { | |
| "VHL_LOSS":0.55,"MTOR_MUT":0.15,"PDL1_HIGH":0.18, | |
| }, | |
| "med_pool": ["pembrolizumab","nivolumab","bevacizumab","sunitinib"], | |
| "prior_chemo_rate": 0.25, | |
| }, | |
| "lymphoma": { | |
| "icd10_prefix": "C85", "sex": "ALL", "count_weight": 0.042, | |
| "age_range": (20, 80), "age_mode": 58, | |
| "stages": ["I","II","III","IV"], "stage_weights": [0.20, 0.25, 0.30, 0.25], | |
| "biomarker_prevalences": { | |
| "CD20_POS":0.85,"EZH2_MUT":0.22,"TMB_HIGH":0.12,"PDL1_HIGH":0.15, | |
| }, | |
| "med_pool": ["rituximab","pembrolizumab","nivolumab"], | |
| "prior_chemo_rate": 0.55, | |
| }, | |
| "endometrial cancer": { | |
| "icd10_prefix": "C54", "sex": "FEMALE", "count_weight": 0.038, | |
| "age_range": (40, 82), "age_mode": 63, | |
| "stages": ["I","II","III","IV"], "stage_weights": [0.50, 0.15, 0.20, 0.15], | |
| "biomarker_prevalences": { | |
| "MSI_H":0.25,"POLE_MUT":0.07,"CTNNB1_MUT":0.30,"TP53_MUT":0.25,"PIK3CA_MUT":0.35, | |
| }, | |
| "med_pool": ["pembrolizumab","bevacizumab","olaparib","capecitabine"], | |
| "prior_chemo_rate": 0.40, | |
| }, | |
| "leukemia": { | |
| "icd10_prefix": "C91", "sex": "ALL", "count_weight": 0.035, | |
| "age_range": (18, 82), "age_mode": 55, | |
| "stages": ["I","II","III","IV"], "stage_weights": [0.25, 0.25, 0.28, 0.22], | |
| "biomarker_prevalences": { | |
| "BCR_ABL1":0.30,"FLT3_ITD":0.25,"NPM1_MUT":0.30,"TP53_MUT":0.15, | |
| }, | |
| "med_pool": ["imatinib","rituximab","pembrolizumab"], | |
| "prior_chemo_rate": 0.60, | |
| }, | |
| "pancreatic cancer": { | |
| "icd10_prefix": "C25", "sex": "ALL", "count_weight": 0.033, | |
| "age_range": (40, 82), "age_mode": 68, | |
| "stages": ["I","II","III","IV"], "stage_weights": [0.05, 0.12, 0.28, 0.55], | |
| "biomarker_prevalences": { | |
| "KRAS_G12C":0.07,"BRCA2_MUT":0.06,"TP53_MUT":0.55,"MSI_H":0.02, | |
| }, | |
| "med_pool": ["capecitabine","erlotinib","olaparib"], | |
| "prior_chemo_rate": 0.50, | |
| }, | |
| "thyroid cancer": { | |
| "icd10_prefix": "C73", "sex": "FEMALE", "count_weight": 0.030, | |
| "age_range": (20, 75), "age_mode": 47, | |
| "stages": ["I","II","III","IV"], "stage_weights": [0.55, 0.20, 0.15, 0.10], | |
| "biomarker_prevalences": { | |
| "BRAF_THYROID":0.45,"RET_FUSION":0.08,"NTRK_FUSION":0.05, | |
| }, | |
| "med_pool": ["pembrolizumab","dabrafenib","vemurafenib"], | |
| "prior_chemo_rate": 0.15, | |
| }, | |
| "multiple myeloma": { | |
| "icd10_prefix": "C90", "sex": "ALL", "count_weight": 0.025, | |
| "age_range": (45, 84), "age_mode": 67, | |
| "stages": ["I","II","III","IV"], "stage_weights": [0.20, 0.28, 0.30, 0.22], | |
| "biomarker_prevalences": { | |
| "WHSC1_MUT":0.20,"CDKN2A_LOSS":0.30,"TP53_MUT":0.15, | |
| }, | |
| "med_pool": ["pembrolizumab","rituximab","bevacizumab"], | |
| "prior_chemo_rate": 0.65, | |
| }, | |
| "gastric cancer": { | |
| "icd10_prefix": "C16", "sex": "ALL", "count_weight": 0.018, | |
| "age_range": (35, 82), "age_mode": 65, | |
| "stages": ["I","II","III","IV"], "stage_weights": [0.10, 0.20, 0.35, 0.35], | |
| "biomarker_prevalences": { | |
| "ERBB2_GC":0.15,"MSI_H":0.10,"PDL1_HIGH":0.20,"TP53_MUT":0.40, | |
| }, | |
| "med_pool": ["trastuzumab","pembrolizumab","nivolumab","capecitabine"], | |
| "prior_chemo_rate": 0.55, | |
| }, | |
| "ovarian cancer": { | |
| "icd10_prefix": "C56", "sex": "FEMALE", "count_weight": 0.018, | |
| "age_range": (35, 80), "age_mode": 62, | |
| "stages": ["I","II","III","IV"], "stage_weights": [0.12, 0.14, 0.40, 0.34], | |
| "biomarker_prevalences": { | |
| "BRCA1_MUT":0.12,"BRCA2_MUT":0.08,"TP53_MUT":0.60,"PIK3CA_MUT":0.08, | |
| }, | |
| "med_pool": ["olaparib","bevacizumab","pembrolizumab"], | |
| "prior_chemo_rate": 0.75, | |
| }, | |
| "hepatocellular carcinoma": { | |
| "icd10_prefix": "C22", "sex": "ALL", "count_weight": 0.015, | |
| "age_range": (35, 80), "age_mode": 62, | |
| "stages": ["I","II","III","IV"], "stage_weights": [0.10, 0.18, 0.32, 0.40], | |
| "biomarker_prevalences": { | |
| "HBV_POS":0.25,"TERT_MUT":0.55,"TP53_MUT":0.20,"CTNNB1_MUT":0.25, | |
| }, | |
| "med_pool": ["pembrolizumab","nivolumab","bevacizumab","atezolizumab"], | |
| "prior_chemo_rate": 0.35, | |
| }, | |
| "glioblastoma": { | |
| "icd10_prefix": "C71", "sex": "ALL", "count_weight": 0.012, | |
| "age_range": (30, 76), "age_mode": 62, | |
| "stages": ["III","IV"], "stage_weights": [0.28, 0.72], | |
| "biomarker_prevalences": { | |
| "IDH1_WT":0.90,"IDH1_R132H":0.10,"MGMT_METH":0.45, | |
| "EGFR_AMP":0.40,"TP53_MUT":0.25, | |
| }, | |
| "med_pool": ["bevacizumab","pembrolizumab"], | |
| "prior_chemo_rate": 0.70, | |
| }, | |
| "head and neck cancer": { | |
| "icd10_prefix": "C10", "sex": "ALL", "count_weight": 0.012, | |
| "age_range": (30, 80), "age_mode": 60, | |
| "stages": ["I","II","III","IV"], "stage_weights": [0.10, 0.15, 0.30, 0.45], | |
| "biomarker_prevalences": { | |
| "HPV_HNSC":0.60,"PIK3CA_HNC":0.25,"PDL1_HIGH":0.20,"TP53_MUT":0.45, | |
| }, | |
| "med_pool": ["pembrolizumab","nivolumab","cetuximab"], | |
| "prior_chemo_rate": 0.55, | |
| }, | |
| "cervical cancer": { | |
| "icd10_prefix": "C53", "sex": "FEMALE", "count_weight": 0.008, | |
| "age_range": (20, 72), "age_mode": 48, | |
| "stages": ["I","II","III","IV"], "stage_weights": [0.28, 0.25, 0.25, 0.22], | |
| "biomarker_prevalences": { | |
| "HPV_POS":0.99,"PDL1_HIGH":0.25,"PIK3CA_MUT":0.25, | |
| }, | |
| "med_pool": ["pembrolizumab","bevacizumab","nivolumab"], | |
| "prior_chemo_rate": 0.50, | |
| }, | |
| "sarcoma": { | |
| "icd10_prefix": "C49", "sex": "ALL", "count_weight": 0.007, | |
| "age_range": (15, 75), "age_mode": 45, | |
| "stages": ["I","II","III","IV"], "stage_weights": [0.20, 0.25, 0.30, 0.25], | |
| "biomarker_prevalences": { | |
| "CDK4_AMP":0.20,"MDM2_AMP":0.18,"TP53_MUT":0.25, | |
| }, | |
| "med_pool": ["pembrolizumab","nivolumab","bevacizumab"], | |
| "prior_chemo_rate": 0.45, | |
| }, | |
| } | |
# Seeds the module-level generator only: this affects draws made through the
# `random` module itself (e.g. random.triangular in _skewed_age), not separate
# random.Random instances created elsewhere.
random.seed(42)  # reproducible synthetic data
| def _parse_age(age_str: str) -> int | None: | |
| if not age_str: | |
| return None | |
| try: | |
| return int(age_str.split()[0]) | |
| except Exception: | |
| return None | |
| def _skewed_age(age_range: tuple[int, int], mode: int) -> int: | |
| """Triangle-distributed age reflecting real incidence peak.""" | |
| lo, hi = age_range | |
| mode = max(lo, min(hi, mode)) | |
| return int(random.triangular(lo, hi, mode)) | |
| def _pick_biomarkers(prevalences: dict[str, float], rng: random.Random) -> list[str]: | |
| """Independent Bernoulli draw per biomarker based on literature prevalence.""" | |
| return [bm for bm, p in prevalences.items() if rng.random() < p] | |
def _pick_comorbidities(rng: random.Random, age: int) -> list[str]:
    """Draw comorbidities, each independently, with age-scaled probability.

    Base probabilities come from ``_COMORBIDITY_POOL`` (iterated as
    ``(name, probability)`` pairs). Each is multiplied by a factor that grows
    by 0.015 per year of age above 50 and is capped at 0.95 before the draw.
    """
    years_past_50 = max(0, (age - 50))
    scale = 1.0 + years_past_50 * 0.015  # comorbidities rise ~1.5% per year after 50
    chosen: list[str] = []
    for name, base_probability in _COMORBIDITY_POOL:
        threshold = min(base_probability * scale, 0.95)
        if rng.random() < threshold:
            chosen.append(name)
    return chosen
def _generate_patient(pid: str, condition: str, profile: dict, seq: int, rng: random.Random) -> dict:
    """Build one synthetic patient record for ``condition``.

    Every random attribute is drawn from ``rng`` so that a patient is fully
    determined by the generator handed in (the age is sampled from ``rng``
    directly instead of going through the module-level ``random`` state).

    Args:
        pid: Identifier stored as the patient's ``id``.
        condition: Condition name; also selects the ECOG weight row.
        profile: The condition's entry from ``_CONDITION_PROFILES``.
        seq: Sequence index of the patient; accepted for interface
            compatibility but not used.
        rng: Source of all random draws for this patient.

    Returns:
        A flat dict of patient properties, ready for the batch writers.
    """
    sex_raw = profile["sex"]
    sex = rng.choice(["MALE", "FEMALE"]) if sex_raw == "ALL" else sex_raw

    # Triangular age draw with the mode clamped into the allowed range.
    age_lo, age_hi = profile["age_range"]
    age_peak = max(age_lo, min(age_hi, profile["age_mode"]))
    age = int(rng.triangular(age_lo, age_hi, age_peak))

    stage = rng.choices(profile["stages"], weights=profile["stage_weights"])[0]
    # Conditions without their own ECOG row fall back to a generic distribution.
    ecog_weights = _ECOG_BY_CONDITION.get(condition, [0.28, 0.40, 0.22, 0.10])
    ecog = rng.choices([0, 1, 2, 3], weights=ecog_weights)[0]

    eth_group = rng.choices(_ETH_NAMES, weights=_ETH_WEIGHTS)[0]
    ethnicity, names_f, names_m, last_names = eth_group
    first = rng.choice(names_f if sex == "FEMALE" else names_m)
    last = rng.choice(last_names)
    city, state = rng.choices(_CITY_NAMES, weights=_CITY_WEIGHTS)[0]
    insurance = rng.choices(_INS_LABELS, weights=_INS_WEIGHTS)[0]

    biomarkers = _pick_biomarkers(profile["biomarker_prevalences"], rng)
    comorbidities = _pick_comorbidities(rng, age)

    # One or two distinct medications, never more than the pool holds.
    med_pool = profile["med_pool"]
    n_med = min(rng.randint(1, 2), len(med_pool))
    medications = rng.sample(med_pool, n_med)

    prior_chemo = rng.random() < profile.get("prior_chemo_rate", 0.5)
    prior_radiation = rng.random() < 0.35
    prior_surgery = rng.random() < 0.50
    # Lines of therapy are only drawn for patients with prior chemotherapy.
    prior_lines = rng.randint(0, 3) if prior_chemo else 0

    return {
        "id": pid,
        "name": f"{first} {last}",
        "age": age,
        "sex": sex,
        "stage": stage,
        "ecog": ecog,
        "condition": condition,
        "icd10_prefix": profile["icd10_prefix"],
        "city": city,
        "state": state,
        "ethnicity": ethnicity,
        "insurance": insurance,
        "biomarkers": biomarkers,
        "medications": medications,
        "comorbidities": comorbidities,
        "prior_chemo": prior_chemo,
        "prior_radiation": prior_radiation,
        "prior_surgery": prior_surgery,
        "prior_lines_of_therapy": prior_lines,
        "source": "synthetic_v2",
    }
# ── Batch write helpers ───────────────────────────────────────────────────────
# Number of patients accumulated before the pending patient, diagnosis,
# biomarker and eligibility batches are flushed to Neo4j.
_BATCH_SIZE = 500
| def _batch_write_patients(patients: list[dict]) -> None: | |
| neo4j_conn.run_query(""" | |
| UNWIND $patients AS p | |
| MERGE (n:Patient {id: p.id}) | |
| SET n += { | |
| name: p.name, age: p.age, sex: p.sex, stage: p.stage, | |
| ecog: p.ecog, condition: p.condition, icd10_prefix: p.icd10_prefix, | |
| city: p.city, state: p.state, ethnicity: p.ethnicity, | |
| insurance: p.insurance, biomarkers: p.biomarkers, | |
| medications: p.medications, comorbidities: p.comorbidities, | |
| prior_chemo: p.prior_chemo, prior_radiation: p.prior_radiation, | |
| prior_surgery: p.prior_surgery, | |
| prior_lines_of_therapy: p.prior_lines_of_therapy, | |
| source: p.source, updated_at: datetime() | |
| } | |
| """, {"patients": patients}) | |
| def _batch_write_biomarker_links(links: list[dict]) -> None: | |
| neo4j_conn.run_query(""" | |
| UNWIND $links AS l | |
| MATCH (p:Patient {id: l.pid}) | |
| MATCH (b:Biomarker {id: l.bm_id}) | |
| MERGE (p)-[:HAS_BIOMARKER]->(b) | |
| """, {"links": links}) | |
| def _batch_write_diagnosis_links(links: list[dict]) -> None: | |
| # links already have resolved diagnosis_code (exact match, no scan needed) | |
| neo4j_conn.run_query(""" | |
| UNWIND $links AS l | |
| MATCH (p:Patient {id: l.pid}) | |
| MATCH (d:Diagnosis {code: l.diagnosis_code}) | |
| MERGE (p)-[:HAS_DIAGNOSIS]->(d) | |
| """, {"links": links}) | |
| def _batch_write_eligibility(edges: list[dict]) -> None: | |
| neo4j_conn.run_query(""" | |
| UNWIND $edges AS e | |
| MATCH (p:Patient {id: e.pid}) | |
| MATCH (t:Trial {id: e.tid}) | |
| MERGE (p)-[r:ELIGIBLE_FOR]->(t) | |
| SET r.score = e.score, r.matched_at = datetime() | |
| """, {"edges": edges}) | |
# ── Main patient seeder ───────────────────────────────────────────────────────
def _allocate_condition_counts(total_patients: int, weights: dict[str, float]) -> dict[str, int]:
    """Split ``total_patients`` across conditions in proportion to ``weights``.

    Every condition first gets at least one patient; the rounding surplus is
    added to the largest condition, and a deficit is taken back one patient
    at a time from whichever condition is currently largest, so no count can
    become negative even when ``total_patients`` is smaller than the number
    of conditions.
    """
    target = max(0, total_patients)
    total_weight = sum(weights.values())
    counts = {
        cond: max(1, round(target * weight / total_weight))
        for cond, weight in weights.items()
    }
    if not counts:
        return counts

    diff = target - sum(counts.values())
    if diff >= 0:
        largest = max(counts, key=counts.__getitem__)
        counts[largest] += diff
    else:
        for _ in range(-diff):
            largest = max(counts, key=counts.__getitem__)
            counts[largest] -= 1
    return counts


def _flush_patient_batches(
    patients: list[dict],
    dx_links: list[dict],
    bm_links: list[dict],
    elig_edges: list[dict],
) -> int:
    """Write one set of pending batches; return the number of edges written.

    Patients are written first because the link queries MATCH on them.
    """
    _batch_write_patients(patients)
    if dx_links:
        _batch_write_diagnosis_links(dx_links)
    if bm_links:
        _batch_write_biomarker_links(bm_links)
    if elig_edges:
        _batch_write_eligibility(elig_edges)
    return len(elig_edges)


def seed_patients_and_eligibility(total_patients: int = 100_000, seed: int = 42) -> int:
    """Generate synthetic patients and their trial-eligibility edges in Neo4j.

    Patients are allocated to conditions by ``count_weight``, generated with
    ``_generate_patient`` and written in batches of ``_BATCH_SIZE`` together
    with their diagnosis, biomarker and ELIGIBLE_FOR links. A patient is
    linked to a recruiting trial of the same condition when the trial's sex
    and age limits admit the patient and the patient's ECOG is at most 2.

    The run is resumable: conditions that already hold enough
    ``synthetic_v2`` patients are skipped, and partially seeded conditions
    continue after the existing count. Each patient gets its own generator
    seeded from ``seed`` and the patient ID, so what is generated for a given
    ID does not depend on where a run resumed or on how many trials exist
    (to the extent that ``_generate_patient`` draws only from the generator
    it is given).

    Args:
        total_patients: Target number of patients across all conditions.
        seed: Base seed combined with each patient ID.

    Returns:
        The number of patients accounted for (existing plus newly written).
    """
    print(f"\n[6/6] Generating {total_patients:,} clinically-informed synthetic patients...")
    print(" (SEER incidence weights Β· TCGA biomarker prevalence Β· US Census demographics)")

    # Pre-load recruiting trials grouped by condition; age limits are parsed
    # once here instead of once per patient/trial pair.
    trial_rows = neo4j_conn.run_query("""
        MATCH (t:Trial {status: 'RECRUITING'})
        RETURN t.id AS id, t.condition AS condition, t.sex AS sex,
               t.min_age AS min_age, t.max_age AS max_age
    """)
    trials_by_condition: dict[str, list[dict]] = {}
    for row in (trial_rows or []):
        cond = (row.get("condition") or "").lower().strip()
        trials_by_condition.setdefault(cond, []).append({
            "id": row["id"],
            "sex": (row.get("sex") or "ALL").upper(),
            "min_age": _parse_age(row.get("min_age") or ""),
            "max_age": _parse_age(row.get("max_age") or ""),
        })

    # Per-condition patient counts that sum exactly to the (non-negative) target.
    condition_counts = _allocate_condition_counts(
        total_patients,
        {cond: prof["count_weight"] for cond, prof in _CONDITION_PROFILES.items()},
    )

    # Pre-load one canonical Diagnosis code per ICD-10 prefix.
    all_prefixes = list({p["icd10_prefix"] for p in _CONDITION_PROFILES.values()})
    dx_canon: dict[str, str] = {}
    for prefix in all_prefixes:
        rows = neo4j_conn.run_query(
            "MATCH (d:Diagnosis) WHERE d.code STARTS WITH $p RETURN d.code AS code ORDER BY d.code LIMIT 1",
            {"p": prefix}
        )
        if rows:
            dx_canon[prefix] = rows[0]["code"]

    # Existing synthetic patients per condition, used to resume a partial run.
    existing_rows = neo4j_conn.run_query("""
        MATCH (p:Patient) WHERE p.source = 'synthetic_v2'
        RETURN p.condition AS condition, count(p) AS cnt
    """)
    existing_by_condition: dict[str, int] = {
        r["condition"]: r["cnt"] for r in (existing_rows or []) if r.get("condition")
    }

    grand_total = 0
    grand_edges = 0
    for condition, profile in _CONDITION_PROFILES.items():
        icd_prefix = profile["icd10_prefix"]
        n = condition_counts[condition]
        already = existing_by_condition.get(condition, 0)
        condition_trials = trials_by_condition.get(condition, [])

        if already >= n:
            print(f" {condition}: {n:,} patients β already done, skipping")
            grand_total += n
            continue

        skip = already
        print(f" {condition}: {n:,} patients ({len(condition_trials)} trials)"
              + (f" [resuming from {skip:,}]" if skip else ""))

        patient_batch: list[dict] = []
        bm_links: list[dict] = []
        dx_links: list[dict] = []
        elig_edges: list[dict] = []
        diagnosis_code = dx_canon.get(icd_prefix)
        condition_written = 0

        for i in range(skip, n):
            pid = f"P_{icd_prefix}_{grand_total + i + 1:06d}"
            # A string seed is hashed deterministically, so the same ID always
            # yields the same generator state across runs.
            rng = random.Random(f"{seed}:{pid}")
            p = _generate_patient(pid, condition, profile, i, rng)
            patient_batch.append(p)

            if diagnosis_code is not None:
                dx_links.append({"pid": pid, "diagnosis_code": diagnosis_code})
            for bm in p["biomarkers"]:
                bm_links.append({"pid": pid, "bm_id": bm})

            # Eligibility edges: ECOG > 2 excludes the patient from every
            # trial, so it is checked once before the sex/age filters.
            if p["ecog"] <= 2:
                bm_bonus = 0.08 if p["biomarkers"] else 0.0
                for trial in condition_trials:
                    if trial["sex"] not in ("ALL", "BOTH", p["sex"]):
                        continue
                    t_min = trial["min_age"]
                    t_max = trial["max_age"]
                    if t_min is not None and p["age"] < t_min:
                        continue
                    if t_max is not None and p["age"] > t_max:
                        continue
                    base = rng.uniform(0.55, 0.90)
                    score = round(min(base + bm_bonus, 0.99), 2)
                    elig_edges.append({"pid": pid, "tid": trial["id"], "score": score})

            condition_written += 1

            # Flush full batches.
            if len(patient_batch) >= _BATCH_SIZE:
                grand_edges += _flush_patient_batches(patient_batch, dx_links, bm_links, elig_edges)
                patient_batch, dx_links, bm_links, elig_edges = [], [], [], []

        # Flush the remainder.
        if patient_batch:
            grand_edges += _flush_patient_batches(patient_batch, dx_links, bm_links, elig_edges)

        grand_total += n
        print(f" β³ wrote {condition_written:,} patients | total so far: {grand_total:,}/{total_patients:,} | edges: {grand_edges:,}")

    print(f"\n β Total patients: {grand_total:,}")
    print(f" β Total ELIGIBLE_FOR edges: {grand_edges:,}")
    return grand_total
# ── Main entry point ──────────────────────────────────────────────────────────
async def run_seeder(conditions: list[str] | None = None):
    """Run every seeding stage in order and print a timing/count summary.

    The four API-backed stages (trials, medications, diagnoses, literature)
    share one HTTP client; biomarkers, derived eligibility relationships and
    synthetic patients are then seeded synchronously.

    NOTE(review): ``conditions`` is accepted but never referenced in this
    function, so values passed on the command line have no effect here.
    """
    started_at = time.time()
    rule = "=" * 60
    print(rule)
    print("ClinicalMatch AI β Graph Seeder v2")
    print("100 k synthetic patients Β· 20 oncology conditions")
    print(rule)

    request_headers = {"User-Agent": "ClinicalMatchAI/2.0 (hackathon@research.org)"}
    async with httpx.AsyncClient(headers=request_headers) as client:
        # Stages run sequentially; each awaits completion before the next starts.
        n_trials = await seed_trials(client)
        n_meds = await seed_medications(client)
        n_dx = await seed_diagnoses(client)
        n_pubs = await seed_literature(client)
        n_bm = seed_biomarkers()
        derive_eligibility_relationships()
        n_patients = seed_patients_and_eligibility(total_patients=100_000)

    elapsed = time.time() - started_at
    print(f"\n{'=' * 60}")
    print(f"Seeding complete in {elapsed / 60:.1f} min")
    print(f" Trials: {n_trials}")
    print(f" Medications: {n_meds}")
    print(f" Diagnoses: {n_dx}")
    print(f" Publications: {n_pubs}")
    print(f" Biomarkers: {n_bm}")
    print(f" Patients: {n_patients:,}")
    print(rule)
def seed_sync():
    """Blocking entry point: run the async seeder to completion with defaults."""
    seeding = run_seeder()
    asyncio.run(seeding)
if __name__ == "__main__":
    import sys

    # Command-line arguments are forwarded as the condition list; when none
    # are given, None is passed instead of an empty list.
    conditions = sys.argv[1:] or None
    asyncio.run(run_seeder(conditions))