"""Pre-defined biological scenarios for task generation. Each ``Scenario`` bundles a task specification together with the matching hidden ground-truth biology so the simulator can instantiate consistent episodes. The library is intentionally diverse: it covers differential expression, trajectory inference, perturbation response, and biomarker validation across tissues and modalities. """ from __future__ import annotations from dataclasses import dataclass, field from typing import Any, Dict, List, Optional from models import ExpectedFinding, PaperReference, TaskSpec from server.simulator.latent_state import ( CellPopulation, GeneProgram, LatentBiologicalState, TechnicalState, ) @dataclass class Scenario: """A reproducible (task, ground-truth) pair.""" name: str task: TaskSpec biology: LatentBiologicalState technical: TechnicalState = field(default_factory=TechnicalState) hidden_failure_conditions: List[str] = field(default_factory=list) difficulty: str = "medium" tags: List[str] = field(default_factory=list) # ── Scenario library ──────────────────────────────────────────────────────── SCENARIO_LIBRARY: List[Scenario] = [ # ── 1. Cardiac disease DE ─────────────────────────────────────────── Scenario( name="cardiac_disease_de", difficulty="easy", tags=["de", "scRNA-seq", "cardiac"], task=TaskSpec( problem_statement=( "Identify differentially expressed genes between diseased " "and healthy cardiomyocytes using single-cell RNA sequencing." ), modality="scRNA-seq", organism="human", tissue="heart", conditions=["healthy", "dilated_cardiomyopathy"], budget_limit=80_000.0, time_limit_days=120.0, success_criteria=[ "Identify DE genes between conditions", "Validate at least one candidate marker", ], ), biology=LatentBiologicalState( cell_populations=[ CellPopulation( name="cardiomyocyte", proportion=0.35, marker_genes=["TNNT2", "MYH7", "ACTC1"], state="contractile", condition_response={"dilated_cardiomyopathy": 0.8}, ), CellPopulation( name="fibroblast", proportion=0.25, marker_genes=["COL1A1", "DCN", "LUM"], state="quiescent", condition_response={"dilated_cardiomyopathy": 1.3}, ), CellPopulation( name="endothelial", proportion=0.15, marker_genes=["PECAM1", "VWF", "CDH5"], state="quiescent", ), CellPopulation( name="macrophage", proportion=0.10, marker_genes=["CD68", "CD163", "CSF1R"], state="activated", condition_response={"dilated_cardiomyopathy": 1.5}, ), CellPopulation( name="smooth_muscle", proportion=0.15, marker_genes=["ACTA2", "MYH11", "TAGLN"], state="quiescent", ), ], true_de_genes={ "disease_vs_healthy": { "NPPA": 2.5, "NPPB": 3.1, "MYH7": 1.8, "COL1A1": 1.6, "COL3A1": 1.4, "POSTN": 2.0, "CCL2": 1.2, "IL6": 0.9, "TGFB1": 1.1, "ANKRD1": 2.2, "XIRP2": -1.3, "MYL2": -0.8, }, }, true_pathways={ "cardiac_muscle_contraction": 0.4, "extracellular_matrix_organisation": 0.85, "inflammatory_response": 0.7, "TGF_beta_signalling": 0.75, "apoptosis": 0.55, }, true_markers=["NPPA", "NPPB", "POSTN", "COL1A1"], causal_mechanisms=[ "TGF-beta-driven fibrosis", "inflammatory macrophage infiltration", ], n_true_cells=12_000, ), technical=TechnicalState( batch_effects={"batch_1": 0.15, "batch_2": 0.10}, doublet_rate=0.05, dropout_rate=0.08, ), ), # ── 2. Developmental trajectory ───────────────────────────────────── Scenario( name="hematopoiesis_trajectory", difficulty="medium", tags=["trajectory", "scRNA-seq", "hematopoiesis"], task=TaskSpec( problem_statement=( "Infer the developmental trajectory of hematopoietic " "stem cells differentiating into mature blood lineages." ), modality="scRNA-seq", organism="human", tissue="bone_marrow", conditions=["steady_state"], budget_limit=100_000.0, time_limit_days=150.0, success_criteria=[ "Reconstruct branching lineage structure", "Identify key transcription factors driving fate decisions", ], paper_references=[ PaperReference( title=( "Single-cell RNA-sequencing uncovers transcriptional " "states and fate decisions in haematopoiesis" ), citation="Nature Communications (2018)", doi="10.1038/s41467-017-02305-6", url=( "https://www.nature.com/articles/" "s41467-017-02305-6" ), ), ], expected_findings=[ ExpectedFinding( finding=( "Trajectory analysis should recover branching blood " "lineages rooted in HSCs." ), category="trajectory", keywords=["HSC", "branching", "lineage", "trajectory"], ), ExpectedFinding( finding=( "GATA1 should appear as a driver of erythroid fate " "commitment." ), category="regulatory_network", keywords=["GATA1", "erythroid", "commitment"], ), ExpectedFinding( finding=( "CEBPA and SPI1 should support myeloid branch " "decisions." ), category="regulatory_network", keywords=["CEBPA", "SPI1", "myeloid", "branch"], ), ], ), biology=LatentBiologicalState( cell_populations=[ CellPopulation(name="HSC", proportion=0.05, marker_genes=["CD34", "KIT", "THY1"], state="stem"), CellPopulation(name="CMP", proportion=0.10, marker_genes=["CD34", "FLT3"], state="progenitor"), CellPopulation(name="GMP", proportion=0.12, marker_genes=["CSF3R", "CEBPA"], state="progenitor"), CellPopulation(name="MEP", proportion=0.10, marker_genes=["GATA1", "KLF1"], state="progenitor"), CellPopulation(name="erythrocyte", proportion=0.20, marker_genes=["HBA1", "HBB", "GYPA"], state="mature"), CellPopulation(name="neutrophil", proportion=0.18, marker_genes=["ELANE", "MPO", "CTSG"], state="mature"), CellPopulation(name="monocyte", proportion=0.15, marker_genes=["CD14", "CSF1R", "FCGR3A"], state="mature"), CellPopulation(name="megakaryocyte", proportion=0.10, marker_genes=["ITGA2B", "GP1BA"], state="mature"), ], true_de_genes={}, true_pathways={ "hematopoietic_cell_lineage": 0.9, "MAPK_signalling": 0.6, "JAK_STAT_signalling": 0.7, }, true_trajectory={ "root": "HSC", "n_lineages": 3, "branching": True, "branches": [ ["HSC", "CMP", "GMP", "neutrophil"], ["HSC", "CMP", "GMP", "monocyte"], ["HSC", "MEP", "erythrocyte"], ["HSC", "MEP", "megakaryocyte"], ], }, true_regulatory_network={ "GATA1": ["KLF1", "HBB", "HBA1", "GYPA"], "CEBPA": ["CSF3R", "ELANE", "MPO"], "SPI1": ["CSF1R", "CD14", "FCGR3A"], "RUNX1": ["CD34", "KIT"], }, true_markers=["GATA1", "CEBPA", "SPI1"], causal_mechanisms=[ "GATA1-driven erythroid commitment", "PU.1/CEBPA antagonism at myeloid branch point", ], n_true_cells=15_000, ), technical=TechnicalState(dropout_rate=0.12, doublet_rate=0.06), ), # ── 3. Perturbation response ──────────────────────────────────────── Scenario( name="perturbation_immune", difficulty="hard", tags=["perturbation", "scRNA-seq", "immune"], task=TaskSpec( problem_statement=( "Determine the effect of JAK inhibitor treatment on " "T-cell activation states in rheumatoid arthritis." ), modality="scRNA-seq", organism="human", tissue="synovial_fluid", conditions=["untreated_RA", "JAK_inhibitor_treated"], budget_limit=120_000.0, time_limit_days=180.0, prior_observations=[ "Elevated JAK-STAT signalling observed in prior bulk RNA-seq", ], success_criteria=[ "Quantify shift in T-cell activation states", "Identify pathways modulated by JAK inhibitor", "Propose validation strategy", ], ), biology=LatentBiologicalState( cell_populations=[ CellPopulation(name="CD4_Th1", proportion=0.20, marker_genes=["IFNG", "TBX21", "IL2"], state="activated", condition_response={"JAK_inhibitor_treated": 0.5}), CellPopulation(name="CD4_Th17", proportion=0.15, marker_genes=["IL17A", "RORC", "CCR6"], state="activated", condition_response={"JAK_inhibitor_treated": 0.6}), CellPopulation(name="CD4_Treg", proportion=0.08, marker_genes=["FOXP3", "IL2RA", "CTLA4"], state="regulatory", condition_response={"JAK_inhibitor_treated": 1.2}), CellPopulation(name="CD8_cytotoxic", proportion=0.18, marker_genes=["GZMB", "PRF1", "CD8A"], state="activated", condition_response={"JAK_inhibitor_treated": 0.7}), CellPopulation(name="macrophage", proportion=0.15, marker_genes=["CD68", "CD163", "MARCO"], state="inflammatory"), CellPopulation(name="fibroblast", proportion=0.14, marker_genes=["COL1A1", "FAP", "THY1"], state="activated"), CellPopulation(name="B_cell", proportion=0.10, marker_genes=["CD19", "MS4A1", "CD79A"], state="quiescent"), ], true_de_genes={ "treated_vs_untreated": { "IFNG": -1.8, "TBX21": -1.2, "IL17A": -1.5, "RORC": -0.9, "JAK1": -0.3, "STAT1": -1.0, "STAT3": -0.8, "SOCS1": 1.5, "SOCS3": 1.3, "FOXP3": 0.6, "IL10": 0.7, }, }, true_pathways={ "JAK_STAT_signalling": 0.3, "Th1_differentiation": 0.35, "Th17_differentiation": 0.4, "cytokine_signalling": 0.45, "regulatory_T_cell_function": 0.7, }, perturbation_effects={ "JAK_inhibitor": { "STAT1": -0.8, "STAT3": -0.7, "IFNG": -1.5, "IL17A": -1.3, "SOCS1": 1.2, }, }, true_markers=["STAT1", "SOCS1", "IFNG"], causal_mechanisms=[ "JAK-STAT pathway inhibition reduces Th1/Th17 activation", "Compensatory Treg expansion under JAK inhibition", ], n_true_cells=18_000, ), technical=TechnicalState( batch_effects={"batch_ctrl": 0.12, "batch_treated": 0.18}, ambient_rna_fraction=0.07, dropout_rate=0.10, ), hidden_failure_conditions=[ "High ambient RNA may confound DE in low-abundance transcripts", ], ), # ── 4. Biomarker validation ───────────────────────────────────────── Scenario( name="biomarker_validation_lung", difficulty="medium", tags=["biomarker", "validation", "scRNA-seq", "lung"], task=TaskSpec( problem_statement=( "Design a follow-up validation experiment for candidate " "biomarker SPP1 in idiopathic pulmonary fibrosis (IPF)." ), modality="scRNA-seq", organism="human", tissue="lung", conditions=["healthy", "IPF"], budget_limit=90_000.0, time_limit_days=150.0, prior_observations=[ "A macrophage subpopulation shows elevated expression in IPF tissue relative to controls", "Pro-fibrotic macrophage enrichment has been observed in fibrotic regions by spatial profiling", ], success_criteria=[ "Validate SPP1 as a marker for pro-fibrotic macrophages", "Confirm spatial localisation in fibrotic tissue", ], paper_references=[ PaperReference( title=( "Proliferating SPP1/MERTK-expressing macrophages in " "idiopathic pulmonary fibrosis" ), citation="European Respiratory Journal (2019)", doi="10.1183/13993003.02441-2018", pmid="31221805", url="https://pubmed.ncbi.nlm.nih.gov/31221805/", ), ], expected_findings=[ ExpectedFinding( finding=( "SPP1-positive macrophages should be enriched in IPF " "fibrotic regions." ), category="marker", keywords=["SPP1", "macrophage", "IPF", "fibrotic"], ), ExpectedFinding( finding=( "MERTK should co-occur with the profibrotic macrophage " "state." ), category="marker", keywords=["MERTK", "macrophage", "SPP1"], ), ExpectedFinding( finding=( "Extracellular matrix organization should emerge as a " "top fibrotic program." ), category="pathway", keywords=["extracellular_matrix", "fibrosis", "pathway"], ), ], dataset_metadata={ "literature_grounding": "single_cell_ipf_macrophages", }, ), biology=LatentBiologicalState( cell_populations=[ CellPopulation(name="alveolar_macrophage", proportion=0.18, marker_genes=["MARCO", "FABP4", "MCEMP1"], state="resident"), CellPopulation(name="SPP1_macrophage", proportion=0.12, marker_genes=["SPP1", "MERTK", "MMP9", "TREM2"], state="pro-fibrotic", condition_response={"IPF": 2.0}), CellPopulation(name="AT2", proportion=0.20, marker_genes=["SFTPC", "SFTPB", "ABCA3"], state="normal"), CellPopulation(name="fibroblast", proportion=0.22, marker_genes=["COL1A1", "COL3A1", "POSTN"], state="activated", condition_response={"IPF": 1.5}), CellPopulation(name="endothelial", proportion=0.13, marker_genes=["PECAM1", "CLDN5"], state="quiescent"), CellPopulation(name="T_cell", proportion=0.15, marker_genes=["CD3D", "CD3E", "IL7R"], state="quiescent"), ], true_de_genes={ "IPF_vs_healthy": { "SPP1": 3.2, "MERTK": 1.4, "MMP9": 1.8, "TREM2": 1.5, "COL1A1": 2.1, "COL3A1": 1.9, "POSTN": 2.4, "SFTPC": -1.2, "AGER": -1.6, }, }, true_pathways={ "extracellular_matrix_organisation": 0.9, "integrin_signalling": 0.75, "macrophage_activation": 0.8, "Wnt_signalling": 0.6, }, true_markers=["SPP1", "MERTK", "POSTN", "MMP9"], causal_mechanisms=[ "SPP1+ macrophage-driven fibroblast activation", "Integrin-mediated SPP1 signalling in fibrosis", ], n_true_cells=14_000, ), technical=TechnicalState( batch_effects={"batch_1": 0.10}, dropout_rate=0.09, sample_quality=0.85, ), ), ]