"""
Data models for the Bio-Experiment Planning RL Environment.

Defines the POMDP action and observation contracts for a scientific agent
that constructs biological experiment pipelines step-by-step.
"""

from __future__ import annotations

from enum import Enum
from typing import Any, Dict, List, Optional

from pydantic import BaseModel, Field

# NOTE(review): `Observation` is not referenced in the portion of the file
# covered here; presumably it is used by models defined further down.
from openenv.core.env_server.types import Action, Observation

# ── Action vocabulary ───────────────────────────────────────────────────────


# Discrete step types an agent can request. The string values are the wire
# format (the enum subclasses `str`), so they must stay stable.
class ActionType(str, Enum):
    COLLECT_SAMPLE = "collect_sample"
    SELECT_COHORT = "select_cohort"
    PREPARE_LIBRARY = "prepare_library"
    CULTURE_CELLS = "culture_cells"
    PERTURB_GENE = "perturb_gene"
    PERTURB_COMPOUND = "perturb_compound"
    SEQUENCE_CELLS = "sequence_cells"
    RUN_QC = "run_qc"
    FILTER_DATA = "filter_data"
    NORMALIZE_DATA = "normalize_data"
    INTEGRATE_BATCHES = "integrate_batches"
    CLUSTER_CELLS = "cluster_cells"
    DIFFERENTIAL_EXPRESSION = "differential_expression"
    TRAJECTORY_ANALYSIS = "trajectory_analysis"
    PATHWAY_ENRICHMENT = "pathway_enrichment"
    REGULATORY_NETWORK_INFERENCE = "regulatory_network_inference"
    MARKER_SELECTION = "marker_selection"
    VALIDATE_MARKER = "validate_marker"
    # Member name and value differ here ("..._experiment" suffix in the value).
    DESIGN_FOLLOWUP = "design_followup_experiment"
    REQUEST_SUBAGENT_REVIEW = "request_subagent_review"
    SYNTHESIZE_CONCLUSION = "synthesize_conclusion"


# The three frozensets below are disjoint and together contain every
# ActionType member (8 + 10 + 3 = 21). Keep them in sync when adding members.
WET_LAB_ACTIONS = frozenset({
    ActionType.COLLECT_SAMPLE,
    ActionType.SELECT_COHORT,
    ActionType.PREPARE_LIBRARY,
    ActionType.CULTURE_CELLS,
    ActionType.PERTURB_GENE,
    ActionType.PERTURB_COMPOUND,
    ActionType.SEQUENCE_CELLS,
    ActionType.VALIDATE_MARKER,
})

COMPUTATIONAL_ACTIONS = frozenset({
    ActionType.RUN_QC,
    ActionType.FILTER_DATA,
    ActionType.NORMALIZE_DATA,
    ActionType.INTEGRATE_BATCHES,
    ActionType.CLUSTER_CELLS,
    ActionType.DIFFERENTIAL_EXPRESSION,
    ActionType.TRAJECTORY_ANALYSIS,
    ActionType.PATHWAY_ENRICHMENT,
    ActionType.REGULATORY_NETWORK_INFERENCE,
    ActionType.MARKER_SELECTION,
})

META_ACTIONS = frozenset({
    ActionType.DESIGN_FOLLOWUP,
    ActionType.REQUEST_SUBAGENT_REVIEW,
    ActionType.SYNTHESIZE_CONCLUSION,
})


# ── Tool, Assay & Modality Registries ──────────────────────────────────────


# Functional grouping for ToolSpec entries. Several members (e.g.
# VARIANT_CALLING, VISUALIZATION) have no tool in TOOL_REGISTRY below.
class ToolCategory(str, Enum):
    ALIGNMENT = "alignment"
    PREPROCESSING = "preprocessing"
    NORMALIZATION = "normalization"
    DIMENSIONALITY_REDUCTION = "dimensionality_reduction"
    CLUSTERING = "clustering"
    DIFFERENTIAL_EXPRESSION = "differential_expression"
    TRAJECTORY = "trajectory"
    GENE_REGULATORY_NETWORK = "gene_regulatory_network"
    CELL_COMMUNICATION = "cell_communication"
    SPATIAL = "spatial"
    MULTIMODAL_INTEGRATION = "multimodal_integration"
    GENE_SET_ANALYSIS = "gene_set_analysis"
    VARIANT_CALLING = "variant_calling"
    PEAK_CALLING = "peak_calling"
    IMPUTATION = "imputation"
    BATCH_CORRECTION = "batch_correction"
    CELL_TYPE_ANNOTATION = "cell_type_annotation"
    SIMULATION = "simulation"
    VISUALIZATION = "visualization"
    QUALITY_CONTROL = "quality_control"
    PERTURBATION_ANALYSIS = "perturbation_analysis"


class ToolSpec(BaseModel):
    """Registry entry describing a bioinformatics tool."""

    name: str
    category: ToolCategory
    # Modality names as plain strings; tools_for_modality() matches on these.
    modalities: List[str] = Field(default_factory=list)
    description: str = ""
    # Free-form artifact labels (e.g. "fastq", "count_matrix").
    input_types: List[str] = Field(default_factory=list)
    output_types: List[str] = Field(default_factory=list)
    # Entries that omit the fields below inherit these defaults.
    typical_runtime_hours: float = 0.1
    typical_cost_usd: float = 0.0
    requires_gpu: bool = False
    open_source: bool = True


# Keyed by tool name; in every entry the key equals ToolSpec.name.
TOOL_REGISTRY: Dict[str, ToolSpec] = {
    # ── Alignment & quantification ──
    "CellRanger": ToolSpec(
        name="CellRanger",
        category=ToolCategory.ALIGNMENT,
        modalities=["scRNA-seq", "scATAC-seq", "CITE-seq", "scMultiome"],
        description="10x Genomics pipeline for alignment, barcode demux, and counting",
        input_types=["fastq"],
        output_types=["count_matrix", "bam"],
        typical_runtime_hours=4.0,
        open_source=False,
    ),
    "STARsolo": ToolSpec(
        name="STARsolo",
        category=ToolCategory.ALIGNMENT,
        modalities=["scRNA-seq", "scATAC-seq"],
        description="Drop-seq / 10x-compatible aligner built into STAR",
        input_types=["fastq"],
        output_types=["count_matrix", "bam"],
        typical_runtime_hours=3.0,
    ),
    "kallisto_bustools": ToolSpec(
        name="kallisto_bustools",
        category=ToolCategory.ALIGNMENT,
        modalities=["scRNA-seq"],
        description="Pseudoalignment-based lightweight quantification",
        input_types=["fastq"],
        output_types=["count_matrix"],
        typical_runtime_hours=1.0,
    ),
    "Salmon_alevin": ToolSpec(
        name="Salmon_alevin",
        category=ToolCategory.ALIGNMENT,
        modalities=["scRNA-seq"],
        description="Quasi-mapping quantification for single-cell RNA-seq",
        input_types=["fastq"],
        output_types=["count_matrix"],
        typical_runtime_hours=1.5,
    ),
    "spaceranger": ToolSpec(
        name="spaceranger",
        category=ToolCategory.ALIGNMENT,
        modalities=["spatial_transcriptomics"],
        description="10x Visium spatial alignment and quantification",
        input_types=["fastq", "image"],
        output_types=["count_matrix", "spatial_coords"],
        typical_runtime_hours=3.0,
        open_source=False,
    ),
    # ── Preprocessing / analysis frameworks ──
    "Scanpy": ToolSpec(
        name="Scanpy",
        category=ToolCategory.PREPROCESSING,
        modalities=["scRNA-seq", "scATAC-seq", "spatial_transcriptomics"],
        description="Python single-cell analysis framework",
        input_types=["count_matrix", "h5ad"],
        output_types=["h5ad", "embedding", "cluster_result"],
        typical_runtime_hours=0.5,
    ),
    "Seurat": ToolSpec(
        name="Seurat",
        category=ToolCategory.PREPROCESSING,
        modalities=["scRNA-seq", "CITE-seq", "spatial_transcriptomics", "scATAC-seq"],
        description="R single-cell analysis toolkit with multimodal support",
        input_types=["count_matrix", "h5seurat"],
        output_types=["h5seurat", "embedding", "cluster_result"],
        typical_runtime_hours=0.5,
    ),
    "Bioconductor_SingleCellExperiment": ToolSpec(
        name="Bioconductor_SingleCellExperiment",
        category=ToolCategory.PREPROCESSING,
        modalities=["scRNA-seq"],
        description="R/Bioconductor framework for single-cell experiments",
        input_types=["count_matrix"],
        output_types=["sce_object"],
        typical_runtime_hours=0.3,
    ),
    # ── Normalization ──
    "scran": ToolSpec(
        name="scran",
        category=ToolCategory.NORMALIZATION,
        modalities=["scRNA-seq"],
        description="Pool-based size-factor normalization",
        input_types=["count_matrix"],
        output_types=["normalized_matrix"],
    ),
    "sctransform": ToolSpec(
        name="sctransform",
        category=ToolCategory.NORMALIZATION,
        modalities=["scRNA-seq"],
        description="Variance-stabilizing transformation via regularized NB regression",
        input_types=["count_matrix"],
        output_types=["normalized_matrix"],
    ),
    # ── Dimensionality reduction ──
    "scVI": ToolSpec(
        name="scVI",
        category=ToolCategory.DIMENSIONALITY_REDUCTION,
        modalities=["scRNA-seq", "CITE-seq", "scATAC-seq"],
        description="Deep generative model for scRNA-seq (variational inference)",
        input_types=["count_matrix"],
        output_types=["latent_embedding"],
        requires_gpu=True,
    ),
    "UMAP": ToolSpec(
        name="UMAP",
        category=ToolCategory.DIMENSIONALITY_REDUCTION,
        modalities=["scRNA-seq", "scATAC-seq", "CITE-seq", "spatial_transcriptomics"],
        description="Uniform manifold approximation for 2D/3D visualization",
        input_types=["pca_embedding", "latent_embedding"],
        output_types=["2d_embedding"],
    ),
    # ── Clustering ──
    "Leiden": ToolSpec(
        name="Leiden",
        category=ToolCategory.CLUSTERING,
        modalities=["scRNA-seq", "scATAC-seq", "CITE-seq"],
        description="Community detection via the Leiden algorithm",
        input_types=["knn_graph"],
        output_types=["cluster_result"],
    ),
    "Louvain": ToolSpec(
        name="Louvain",
        category=ToolCategory.CLUSTERING,
        modalities=["scRNA-seq", "scATAC-seq"],
        description="Community detection via Louvain modularity optimization",
        input_types=["knn_graph"],
        output_types=["cluster_result"],
    ),
    # ── Differential expression ──
    "DESeq2": ToolSpec(
        name="DESeq2",
        category=ToolCategory.DIFFERENTIAL_EXPRESSION,
        modalities=["bulk_rna_seq", "scRNA-seq"],
        description="Negative binomial GLM-based differential expression",
        input_types=["count_matrix"],
        output_types=["de_result"],
    ),
    "MAST": ToolSpec(
        name="MAST",
        category=ToolCategory.DIFFERENTIAL_EXPRESSION,
        modalities=["scRNA-seq"],
        description="Two-part hurdle model for scRNA-seq DE testing",
        input_types=["count_matrix"],
        output_types=["de_result"],
    ),
    "edgeR": ToolSpec(
        name="edgeR",
        category=ToolCategory.DIFFERENTIAL_EXPRESSION,
        modalities=["bulk_rna_seq", "scRNA-seq"],
        description="Empirical Bayes quasi-likelihood DE testing",
        input_types=["count_matrix"],
        output_types=["de_result"],
    ),
    "Wilcoxon": ToolSpec(
        name="Wilcoxon",
        category=ToolCategory.DIFFERENTIAL_EXPRESSION,
        modalities=["scRNA-seq"],
        description="Rank-sum test for marker gene detection",
        input_types=["count_matrix"],
        output_types=["de_result"],
    ),
    # ── Trajectory & RNA velocity ──
    "Monocle3": ToolSpec(
        name="Monocle3",
        category=ToolCategory.TRAJECTORY,
        modalities=["scRNA-seq"],
        description="Reversed graph embedding for pseudotime trajectories",
        input_types=["count_matrix", "embedding"],
        output_types=["trajectory_result", "pseudotime"],
    ),
    "scVelo": ToolSpec(
        name="scVelo",
        category=ToolCategory.TRAJECTORY,
        modalities=["scRNA-seq"],
        description="RNA velocity estimation via spliced/unspliced dynamics",
        input_types=["count_matrix"],
        output_types=["velocity_result"],
    ),
    "CellRank": ToolSpec(
        name="CellRank",
        category=ToolCategory.TRAJECTORY,
        modalities=["scRNA-seq"],
        description="Fate probability estimation combining velocity and transcriptomics",
        input_types=["velocity_result", "count_matrix"],
        output_types=["fate_probabilities"],
    ),
    "Slingshot": ToolSpec(
        name="Slingshot",
        category=ToolCategory.TRAJECTORY,
        modalities=["scRNA-seq"],
        description="Minimum spanning tree-based trajectory inference",
        input_types=["embedding", "cluster_result"],
        output_types=["trajectory_result", "pseudotime"],
    ),
    "PAGA": ToolSpec(
        name="PAGA",
        category=ToolCategory.TRAJECTORY,
        modalities=["scRNA-seq"],
        description="Partition-based graph abstraction for topology estimation",
        input_types=["knn_graph", "cluster_result"],
        output_types=["trajectory_result"],
    ),
    # ── Gene regulatory networks ──
    "SCENIC": ToolSpec(
        name="SCENIC",
        category=ToolCategory.GENE_REGULATORY_NETWORK,
        modalities=["scRNA-seq"],
        description="Single-cell regulatory network inference and clustering",
        input_types=["count_matrix"],
        output_types=["regulon_result", "network_result"],
        typical_runtime_hours=6.0,
    ),
    "CellOracle": ToolSpec(
        name="CellOracle",
        category=ToolCategory.GENE_REGULATORY_NETWORK,
        modalities=["scRNA-seq", "scATAC-seq", "scMultiome"],
        description="GRN-based in-silico perturbation prediction",
        input_types=["count_matrix", "peak_matrix"],
        output_types=["network_result", "perturbation_prediction"],
        typical_runtime_hours=4.0,
    ),
    # ── Cell-cell communication ──
    "CellChat": ToolSpec(
        name="CellChat",
        category=ToolCategory.CELL_COMMUNICATION,
        modalities=["scRNA-seq", "spatial_transcriptomics"],
        description="Ligand-receptor interaction inference with communication patterns",
        input_types=["count_matrix", "cluster_result"],
        output_types=["communication_result"],
    ),
    "NicheNet": ToolSpec(
        name="NicheNet",
        category=ToolCategory.CELL_COMMUNICATION,
        modalities=["scRNA-seq"],
        description="Ligand-target link prediction using prior knowledge",
        input_types=["count_matrix", "de_result"],
        output_types=["communication_result"],
    ),
    "LIANA": ToolSpec(
        name="LIANA",
        category=ToolCategory.CELL_COMMUNICATION,
        modalities=["scRNA-seq", "spatial_transcriptomics"],
        description="Framework unifying multiple ligand-receptor methods",
        input_types=["count_matrix", "cluster_result"],
        output_types=["communication_result"],
    ),
    # ── Spatial analysis ──
    "squidpy": ToolSpec(
        name="squidpy",
        category=ToolCategory.SPATIAL,
        modalities=["spatial_transcriptomics"],
        description="Spatial omics analysis (neighborhood, co-occurrence, image features)",
        input_types=["count_matrix", "spatial_coords"],
        output_types=["spatial_result"],
    ),
    "cell2location": ToolSpec(
        name="cell2location",
        category=ToolCategory.SPATIAL,
        modalities=["spatial_transcriptomics"],
        description="Spatial deconvolution mapping cell types to tissue locations",
        input_types=["count_matrix", "spatial_coords", "reference_h5ad"],
        output_types=["deconvolution_result"],
        requires_gpu=True,
    ),
    "BANKSY": ToolSpec(
        name="BANKSY",
        category=ToolCategory.SPATIAL,
        modalities=["spatial_transcriptomics"],
        description="Spatially-aware clustering combining cell and neighbor features",
        input_types=["count_matrix", "spatial_coords"],
        output_types=["cluster_result"],
    ),
    # ── Batch correction ──
    "Harmony": ToolSpec(
        name="Harmony",
        category=ToolCategory.BATCH_CORRECTION,
        modalities=["scRNA-seq", "scATAC-seq", "CITE-seq"],
        description="Fast iterative batch correction on PCA embeddings",
        input_types=["pca_embedding"],
        output_types=["corrected_embedding"],
    ),
    "scanorama": ToolSpec(
        name="scanorama",
        category=ToolCategory.BATCH_CORRECTION,
        modalities=["scRNA-seq"],
        description="Panoramic stitching of scRNA-seq batches",
        input_types=["count_matrix"],
        output_types=["corrected_embedding", "corrected_matrix"],
    ),
    "BBKNN": ToolSpec(
        name="BBKNN",
        category=ToolCategory.BATCH_CORRECTION,
        modalities=["scRNA-seq"],
        description="Batch-balanced KNN graph construction",
        input_types=["pca_embedding"],
        output_types=["knn_graph"],
    ),
    # ── Multimodal integration ──
    "WNN": ToolSpec(
        name="WNN",
        category=ToolCategory.MULTIMODAL_INTEGRATION,
        modalities=["CITE-seq", "scMultiome"],
        description="Weighted nearest neighbors for multimodal integration (Seurat v4+)",
        input_types=["rna_embedding", "protein_embedding"],
        output_types=["multimodal_embedding"],
    ),
    "MOFA+": ToolSpec(
        name="MOFA+",
        category=ToolCategory.MULTIMODAL_INTEGRATION,
        modalities=["scMultiome", "CITE-seq"],
        description="Multi-omics factor analysis for unsupervised integration",
        input_types=["count_matrix", "peak_matrix"],
        output_types=["factor_result"],
    ),
    # ── Chromatin accessibility (scATAC-seq / scMultiome) ──
    "ArchR": ToolSpec(
        name="ArchR",
        category=ToolCategory.PREPROCESSING,
        modalities=["scATAC-seq", "scMultiome"],
        description="Full-featured scATAC-seq analysis framework in R",
        input_types=["fragments", "bam"],
        output_types=["peak_matrix", "gene_activity_matrix"],
        typical_runtime_hours=2.0,
    ),
    "Signac": ToolSpec(
        name="Signac",
        category=ToolCategory.PREPROCESSING,
        modalities=["scATAC-seq", "scMultiome"],
        description="Seurat extension for chromatin accessibility analysis",
        input_types=["fragments", "peak_matrix"],
        output_types=["peak_matrix", "motif_result"],
    ),
    "chromVAR": ToolSpec(
        name="chromVAR",
        category=ToolCategory.PEAK_CALLING,
        modalities=["scATAC-seq", "scMultiome"],
        description="TF motif accessibility deviation scoring",
        input_types=["peak_matrix"],
        output_types=["motif_deviation_scores"],
    ),
    # ── Gene set / pathway analysis ──
    "GSEA": ToolSpec(
        name="GSEA",
        category=ToolCategory.GENE_SET_ANALYSIS,
        modalities=["bulk_rna_seq", "scRNA-seq"],
        description="Gene Set Enrichment Analysis (preranked or phenotype-based)",
        input_types=["de_result", "ranked_gene_list"],
        output_types=["pathway_result"],
    ),
    "clusterProfiler": ToolSpec(
        name="clusterProfiler",
        category=ToolCategory.GENE_SET_ANALYSIS,
        modalities=["bulk_rna_seq", "scRNA-seq"],
        description="ORA & GSEA with GO, KEGG, Reactome, and custom gene sets",
        input_types=["de_result", "gene_list"],
        output_types=["pathway_result"],
    ),
    "decoupleR": ToolSpec(
        name="decoupleR",
        category=ToolCategory.GENE_SET_ANALYSIS,
        modalities=["scRNA-seq", "bulk_rna_seq", "spatial_transcriptomics"],
        description="Unified framework for functional activity inference (TF, pathway)",
        input_types=["count_matrix", "de_result"],
        output_types=["activity_scores"],
    ),
    # ── Cell type annotation ──
    "celltypist": ToolSpec(
        name="celltypist",
        category=ToolCategory.CELL_TYPE_ANNOTATION,
        modalities=["scRNA-seq"],
        description="Automated cell type classification with pre-trained models",
        input_types=["count_matrix"],
        output_types=["annotation_result"],
    ),
    "SingleR": ToolSpec(
        name="SingleR",
        category=ToolCategory.CELL_TYPE_ANNOTATION,
        modalities=["scRNA-seq"],
        description="Reference-based cell type annotation using correlation",
        input_types=["count_matrix", "reference_dataset"],
        output_types=["annotation_result"],
    ),
    "scArches": ToolSpec(
        name="scArches",
        category=ToolCategory.CELL_TYPE_ANNOTATION,
        modalities=["scRNA-seq", "scATAC-seq", "CITE-seq"],
        description="Reference mapping and label transfer via deep learning",
        input_types=["count_matrix", "reference_model"],
        output_types=["annotation_result", "latent_embedding"],
        requires_gpu=True,
    ),
    # ── Imputation ──
    "MAGIC": ToolSpec(
        name="MAGIC",
        category=ToolCategory.IMPUTATION,
        modalities=["scRNA-seq"],
        description="Markov affinity-based graph imputation of dropout zeros",
        input_types=["count_matrix"],
        output_types=["imputed_matrix"],
    ),
    # ── Perturbation analysis ──
    "MILO": ToolSpec(
        name="MILO",
        category=ToolCategory.PERTURBATION_ANALYSIS,
        modalities=["scRNA-seq"],
        description="Differential abundance testing on KNN graph neighborhoods",
        input_types=["count_matrix", "knn_graph"],
        output_types=["da_result"],
    ),
    "Mixscape": ToolSpec(
        name="Mixscape",
        category=ToolCategory.PERTURBATION_ANALYSIS,
        modalities=["Perturb-seq", "CROP-seq"],
        description="Seurat extension for CRISPR screen perturbation analysis",
        input_types=["count_matrix", "guide_assignments"],
        output_types=["perturbation_result"],
    ),
    "MIMOSCA": ToolSpec(
        name="MIMOSCA",
        category=ToolCategory.PERTURBATION_ANALYSIS,
        modalities=["Perturb-seq", "CROP-seq"],
        description="Multi-input multi-output single-cell analysis for screens",
        input_types=["count_matrix", "guide_assignments"],
        output_types=["perturbation_result"],
    ),
    # ── Quality control ──
    "scrublet": ToolSpec(
        name="scrublet",
        category=ToolCategory.QUALITY_CONTROL,
        modalities=["scRNA-seq"],
        description="Computational doublet detection via synthetic doublets",
        input_types=["count_matrix"],
        output_types=["doublet_scores"],
    ),
    "DoubletFinder": ToolSpec(
        name="DoubletFinder",
        category=ToolCategory.QUALITY_CONTROL,
        modalities=["scRNA-seq"],
        description="Artificial nearest-neighbor doublet detection",
        input_types=["count_matrix"],
        output_types=["doublet_scores"],
    ),
    "SoupX": ToolSpec(
        name="SoupX",
        category=ToolCategory.QUALITY_CONTROL,
        modalities=["scRNA-seq"],
        description="Ambient RNA contamination estimation and removal",
        input_types=["count_matrix", "raw_count_matrix"],
        output_types=["corrected_matrix"],
    ),
    "DecontX": ToolSpec(
        name="DecontX",
        category=ToolCategory.QUALITY_CONTROL,
        modalities=["scRNA-seq"],
        description="Bayesian ambient RNA decontamination",
        input_types=["count_matrix"],
        output_types=["corrected_matrix"],
    ),
    # ── Simulation ──
    "Splatter": ToolSpec(
        name="Splatter",
        category=ToolCategory.SIMULATION,
        modalities=["scRNA-seq"],
        description="Flexible scRNA-seq data simulation framework",
        input_types=["simulation_params"],
        output_types=["simulated_count_matrix"],
    ),
}


# Assay modalities. The string values are the same labels used in
# ToolSpec.modalities, AssaySpec.modalities and as MODALITY_REGISTRY keys.
class Modality(str, Enum):
    SCRNA_SEQ = "scRNA-seq"
    SCATAC_SEQ = "scATAC-seq"
    CITE_SEQ = "CITE-seq"
    SPATIAL_TRANSCRIPTOMICS = "spatial_transcriptomics"
    BULK_RNA_SEQ = "bulk_rna_seq"
    SCRNA_MULTIOME = "scMultiome"
    PERTURB_SEQ = "Perturb-seq"
    CROP_SEQ = "CROP-seq"
    SMART_SEQ2 = "Smart-seq2"
    SLIDE_SEQ = "Slide-seq"
    MERFISH = "MERFISH"
    SEQFISH = "seqFISH"
    PATCH_SEQ = "Patch-seq"
    SHARE_SEQ = "SHARE-seq"
    SNARE_SEQ = "SNARE-seq"
    SC_HI_C = "scHi-C"
    SCBS_SEQ = "scBS-seq"
    SCNMT_SEQ = "scNMT-seq"


class ModalitySpec(BaseModel):
    """Registry entry for a single-cell or bulk assay modality."""

    name: str
    modality: Modality
    measurement: str = ""
    resolution: str = "single-cell"
    multiplexable: bool = False
    # Human-readable range string, not a parsed number.
    typical_cells: str = "1k-20k"
    typical_cost_per_sample_usd: float = 5000.0
    # Tool names; every name used below is a TOOL_REGISTRY key, but nothing
    # in this model validates that.
    compatible_tools: List[str] = Field(default_factory=list)
    description: str = ""


# Keyed by the Modality string value. Modality.SEQFISH, SHARE_SEQ and
# SNARE_SEQ have no entry here.
# NOTE(review): `compatible_tools` here and `ToolSpec.modalities` are
# maintained independently; e.g. MERFISH lists squidpy, yet no ToolSpec lists
# "MERFISH", so tools_for_modality("MERFISH") returns an empty list.
MODALITY_REGISTRY: Dict[str, ModalitySpec] = {
    "scRNA-seq": ModalitySpec(
        name="scRNA-seq",
        modality=Modality.SCRNA_SEQ,
        measurement="mRNA transcripts",
        typical_cells="5k-20k",
        typical_cost_per_sample_usd=5000.0,
        compatible_tools=[
            "CellRanger", "STARsolo", "kallisto_bustools",
            "Scanpy", "Seurat", "scVI", "Leiden",
            "DESeq2", "MAST", "Monocle3", "scVelo",
            "SCENIC", "CellChat", "GSEA", "celltypist", "scrublet",
        ],
        description="Droplet-based single-cell RNA sequencing (e.g. 10x Chromium)",
    ),
    "scATAC-seq": ModalitySpec(
        name="scATAC-seq",
        modality=Modality.SCATAC_SEQ,
        measurement="open chromatin regions",
        typical_cells="5k-15k",
        typical_cost_per_sample_usd=6000.0,
        compatible_tools=[
            "CellRanger", "ArchR", "Signac", "chromVAR", "Scanpy", "Leiden",
        ],
        description="Single-cell Assay for Transposase-Accessible Chromatin",
    ),
    "CITE-seq": ModalitySpec(
        name="CITE-seq",
        modality=Modality.CITE_SEQ,
        measurement="mRNA + surface proteins (ADT)",
        multiplexable=True,
        typical_cells="5k-20k",
        typical_cost_per_sample_usd=8000.0,
        compatible_tools=[
            "CellRanger", "Seurat", "WNN", "MOFA+", "Scanpy", "Leiden",
        ],
        description="Cellular Indexing of Transcriptomes and Epitopes by Sequencing",
    ),
    "spatial_transcriptomics": ModalitySpec(
        name="spatial_transcriptomics",
        modality=Modality.SPATIAL_TRANSCRIPTOMICS,
        measurement="spatially resolved transcripts",
        resolution="spot (55µm) or subcellular",
        typical_cells="1k-10k spots",
        typical_cost_per_sample_usd=7000.0,
        compatible_tools=[
            "spaceranger", "squidpy", "cell2location", "BANKSY", "Scanpy", "Seurat",
        ],
        description="Spatially resolved transcriptomics (Visium, MERFISH, Slide-seq, etc.)",
    ),
    "bulk_rna_seq": ModalitySpec(
        name="bulk_rna_seq",
        modality=Modality.BULK_RNA_SEQ,
        measurement="aggregate mRNA across cells",
        resolution="bulk",
        typical_cells="N/A",
        typical_cost_per_sample_usd=500.0,
        compatible_tools=["DESeq2", "edgeR", "GSEA", "clusterProfiler"],
        description="Standard bulk RNA sequencing",
    ),
    "scMultiome": ModalitySpec(
        name="scMultiome",
        modality=Modality.SCRNA_MULTIOME,
        measurement="mRNA + open chromatin (joint)",
        typical_cells="5k-15k",
        typical_cost_per_sample_usd=10000.0,
        compatible_tools=[
            "CellRanger", "ArchR", "Signac", "Seurat", "MOFA+", "CellOracle",
        ],
        description="10x Multiome (joint scRNA + scATAC from same cell)",
    ),
    "Perturb-seq": ModalitySpec(
        name="Perturb-seq",
        modality=Modality.PERTURB_SEQ,
        measurement="mRNA + CRISPR guide assignment",
        multiplexable=True,
        typical_cells="10k-100k",
        typical_cost_per_sample_usd=15000.0,
        compatible_tools=[
            "CellRanger", "Scanpy", "Seurat", "Mixscape", "MIMOSCA",
        ],
        description="Pooled CRISPR screens with single-cell RNA readout",
    ),
    "CROP-seq": ModalitySpec(
        name="CROP-seq",
        modality=Modality.CROP_SEQ,
        measurement="mRNA + CRISPR guide assignment",
        multiplexable=True,
        typical_cells="10k-50k",
        typical_cost_per_sample_usd=12000.0,
        compatible_tools=[
            "CellRanger", "Scanpy", "Seurat", "Mixscape", "MIMOSCA",
        ],
        description="CRISPR dropout screen with single-cell RNA readout",
    ),
    "Smart-seq2": ModalitySpec(
        name="Smart-seq2",
        modality=Modality.SMART_SEQ2,
        measurement="full-length mRNA transcripts",
        typical_cells="100-1000",
        typical_cost_per_sample_usd=10000.0,
        compatible_tools=["Scanpy", "Seurat", "DESeq2", "MAST", "Monocle3"],
        description="Plate-based full-length scRNA-seq with high sensitivity",
    ),
    "MERFISH": ModalitySpec(
        name="MERFISH",
        modality=Modality.MERFISH,
        measurement="in situ mRNA (imaging-based)",
        resolution="subcellular",
        typical_cells="10k-1M",
        typical_cost_per_sample_usd=20000.0,
        compatible_tools=["squidpy", "Scanpy", "BANKSY"],
        description="Multiplexed Error-Robust FISH for spatial transcriptomics",
    ),
    "Slide-seq": ModalitySpec(
        name="Slide-seq",
        modality=Modality.SLIDE_SEQ,
        measurement="spatially resolved mRNA (bead array)",
        resolution="10µm",
        typical_cells="10k-50k beads",
        typical_cost_per_sample_usd=8000.0,
        compatible_tools=["squidpy", "cell2location", "Scanpy"],
        description="Near-cellular spatial transcriptomics on bead arrays",
    ),
    "Patch-seq": ModalitySpec(
        name="Patch-seq",
        modality=Modality.PATCH_SEQ,
        measurement="mRNA + electrophysiology + morphology",
        typical_cells="10-500",
        typical_cost_per_sample_usd=50000.0,
        compatible_tools=["Scanpy", "Seurat"],
        description="Combined patch-clamp electrophysiology and scRNA-seq",
    ),
    "scHi-C": ModalitySpec(
        name="scHi-C",
        modality=Modality.SC_HI_C,
        measurement="3D chromatin contacts",
        typical_cells="1k-10k",
        typical_cost_per_sample_usd=15000.0,
        compatible_tools=["Scanpy"],
        description="Single-cell chromosome conformation capture",
    ),
    "scBS-seq": ModalitySpec(
        name="scBS-seq",
        modality=Modality.SCBS_SEQ,
        measurement="DNA methylation (CpG)",
        typical_cells="100-5k",
        typical_cost_per_sample_usd=12000.0,
        compatible_tools=["Scanpy"],
        description="Single-cell bisulfite sequencing for DNA methylation",
    ),
    "scNMT-seq": ModalitySpec(
        name="scNMT-seq",
        modality=Modality.SCNMT_SEQ,
        measurement="nucleosome + methylation + transcription (joint)",
        typical_cells="100-1k",
        typical_cost_per_sample_usd=25000.0,
        compatible_tools=["MOFA+", "Scanpy"],
        description="Joint single-cell nucleosome, methylation, and transcription",
    ),
}


# Coarse grouping for AssaySpec entries. METABOLOMICS is declared but no
# assay in ASSAY_REGISTRY below uses it.
class AssayCategory(str, Enum):
    SEQUENCING = "sequencing"
    IMAGING = "imaging"
    PERTURBATION = "perturbation"
    FUNCTIONAL = "functional"
    EPIGENOMICS = "epigenomics"
    PROTEOMICS = "proteomics"
    METABOLOMICS = "metabolomics"


class AssaySpec(BaseModel):
    """Registry entry for a laboratory assay or protocol."""

    name: str
    category: AssayCategory
    # Modality names this assay is associated with; assays_for_modality()
    # matches on these. Left empty for assays with no listed modality.
    modalities: List[str] = Field(default_factory=list)
    description: str = ""
    typical_duration_days: float = 1.0
    typical_cost_usd: float = 1000.0
    requires_live_cells: bool = False
    requires_fresh_tissue: bool = False
    # Free-form label; entries use values like "high (500-20k cells)".
    throughput: str = "medium"
    outputs: List[str] = Field(default_factory=list)


# Keyed by assay name; in every entry the key equals AssaySpec.name.
ASSAY_REGISTRY: Dict[str, AssaySpec] = {
    "10x_chromium": AssaySpec(
        name="10x_chromium",
        category=AssayCategory.SEQUENCING,
        modalities=["scRNA-seq", "scATAC-seq", "CITE-seq", "scMultiome"],
        description="10x Genomics Chromium droplet-based single-cell partitioning",
        typical_duration_days=2.0,
        typical_cost_usd=5000.0,
        requires_live_cells=True,
        throughput="high (500-20k cells)",
        outputs=["fastq", "count_matrix"],
    ),
    "smart-seq2": AssaySpec(
        name="smart-seq2",
        category=AssayCategory.SEQUENCING,
        modalities=["Smart-seq2"],
        description="Plate-based full-length cDNA scRNA-seq",
        typical_duration_days=3.0,
        typical_cost_usd=10000.0,
        requires_live_cells=True,
        throughput="low (96-384 cells)",
        outputs=["fastq", "count_matrix"],
    ),
    # Mapped to the "Smart-seq2" modality; Modality has no Smart-seq3 member.
    "smart-seq3": AssaySpec(
        name="smart-seq3",
        category=AssayCategory.SEQUENCING,
        modalities=["Smart-seq2"],
        description="Improved full-length scRNA-seq with UMIs",
        typical_duration_days=3.0,
        typical_cost_usd=10000.0,
        requires_live_cells=True,
        throughput="low (96-384 cells)",
        outputs=["fastq", "count_matrix"],
    ),
    "bulk_rna_seq": AssaySpec(
        name="bulk_rna_seq",
        category=AssayCategory.SEQUENCING,
        modalities=["bulk_rna_seq"],
        description="Standard bulk RNA sequencing with poly-A or ribo-depletion",
        typical_duration_days=3.0,
        typical_cost_usd=500.0,
        throughput="high",
        outputs=["fastq", "count_matrix"],
    ),
    "atac-seq": AssaySpec(
        name="atac-seq",
        category=AssayCategory.EPIGENOMICS,
        modalities=["scATAC-seq"],
        description="Assay for Transposase-Accessible Chromatin using sequencing",
        typical_duration_days=2.0,
        typical_cost_usd=6000.0,
        requires_live_cells=True,
        outputs=["fastq", "fragments", "peak_matrix"],
    ),
    "cite-seq": AssaySpec(
        name="cite-seq",
        category=AssayCategory.PROTEOMICS,
        modalities=["CITE-seq"],
        description="Simultaneous RNA + surface protein via DNA-barcoded antibodies",
        typical_duration_days=2.0,
        typical_cost_usd=8000.0,
        requires_live_cells=True,
        throughput="high (5k-20k cells)",
        outputs=["fastq", "count_matrix", "adt_matrix"],
    ),
    "10x_multiome": AssaySpec(
        name="10x_multiome",
        category=AssayCategory.SEQUENCING,
        modalities=["scMultiome"],
        description="Joint scRNA-seq + scATAC-seq from the same cell",
        typical_duration_days=2.0,
        typical_cost_usd=10000.0,
        requires_live_cells=True,
        throughput="high (5k-15k cells)",
        outputs=["fastq", "count_matrix", "fragments"],
    ),
    "visium": AssaySpec(
        name="visium",
        category=AssayCategory.SEQUENCING,
        modalities=["spatial_transcriptomics"],
        description="10x Visium spatially barcoded capture on tissue sections",
        typical_duration_days=3.0,
        typical_cost_usd=7000.0,
        requires_fresh_tissue=True,
        throughput="medium (1k-5k spots)",
        outputs=["fastq", "count_matrix", "spatial_coords", "image"],
    ),
    "visium_hd": AssaySpec(
        name="visium_hd",
        category=AssayCategory.SEQUENCING,
        modalities=["spatial_transcriptomics"],
        description="High-definition Visium with 2µm bin resolution",
        typical_duration_days=3.0,
        typical_cost_usd=10000.0,
        requires_fresh_tissue=True,
        throughput="high",
        outputs=["fastq", "count_matrix", "spatial_coords", "image"],
    ),
    "merfish": AssaySpec(
        name="merfish",
        category=AssayCategory.IMAGING,
        modalities=["MERFISH"],
        description="Multiplexed Error-Robust FISH imaging-based spatial",
        typical_duration_days=5.0,
        typical_cost_usd=20000.0,
        requires_fresh_tissue=True,
        throughput="high (100-1000 genes, millions of transcripts)",
        outputs=["transcript_coords", "cell_segmentation"],
    ),
    "seqfish_plus": AssaySpec(
        name="seqfish_plus",
        category=AssayCategory.IMAGING,
        modalities=["seqFISH"],
        description="Sequential FISH for imaging-based spatial transcriptomics",
        typical_duration_days=5.0,
        typical_cost_usd=15000.0,
        requires_fresh_tissue=True,
        outputs=["transcript_coords"],
    ),
    "slide-seq": AssaySpec(
        name="slide-seq",
        category=AssayCategory.SEQUENCING,
        modalities=["Slide-seq"],
        description="Near-cellular spatial transcriptomics on bead arrays",
        typical_duration_days=3.0,
        typical_cost_usd=8000.0,
        requires_fresh_tissue=True,
        outputs=["count_matrix", "spatial_coords"],
    ),
    "perturb-seq": AssaySpec(
        name="perturb-seq",
        category=AssayCategory.PERTURBATION,
        modalities=["Perturb-seq"],
        description="Pooled CRISPR screen + scRNA-seq readout",
        typical_duration_days=14.0,
        typical_cost_usd=15000.0,
        requires_live_cells=True,
        throughput="high (10k-100k cells)",
        outputs=["fastq", "count_matrix", "guide_assignments"],
    ),
    "crop-seq": AssaySpec(
        name="crop-seq",
        category=AssayCategory.PERTURBATION,
        modalities=["CROP-seq"],
        description="CRISPR dropout screening with scRNA-seq readout",
        typical_duration_days=14.0,
        typical_cost_usd=12000.0,
        requires_live_cells=True,
        throughput="high (10k-50k cells)",
        outputs=["fastq", "count_matrix", "guide_assignments"],
    ),
    "patch-seq": AssaySpec(
        name="patch-seq",
        category=AssayCategory.FUNCTIONAL,
        modalities=["Patch-seq"],
        description="Patch-clamp electrophysiology + scRNA-seq on same neuron",
        typical_duration_days=7.0,
        typical_cost_usd=50000.0,
        requires_live_cells=True,
        throughput="very low (10-100 cells)",
        outputs=["fastq", "count_matrix", "ephys_trace", "morphology"],
    ),
    "sc_hi_c": AssaySpec(
        name="sc_hi_c",
        category=AssayCategory.EPIGENOMICS,
        modalities=["scHi-C"],
        description="Single-cell chromosome conformation capture",
        typical_duration_days=5.0,
        typical_cost_usd=15000.0,
        outputs=["contact_matrix"],
    ),
    "sc_bisulfite": AssaySpec(
        name="sc_bisulfite",
        category=AssayCategory.EPIGENOMICS,
        modalities=["scBS-seq"],
        description="Single-cell bisulfite sequencing for DNA methylation profiling",
        typical_duration_days=5.0,
        typical_cost_usd=12000.0,
        outputs=["methylation_matrix"],
    ),
    "sc_nmt_seq": AssaySpec(
        name="sc_nmt_seq",
        category=AssayCategory.EPIGENOMICS,
        modalities=["scNMT-seq"],
        description="Joint nucleosome occupancy, methylation, and transcription",
        typical_duration_days=7.0,
        typical_cost_usd=25000.0,
        requires_live_cells=True,
        throughput="low (100-1k cells)",
        outputs=["count_matrix", "methylation_matrix", "accessibility_matrix"],
    ),
    # The remaining assays declare no modality (modalities=[]), so
    # assays_for_modality() never returns them.
    "flow_cytometry": AssaySpec(
        name="flow_cytometry",
        category=AssayCategory.FUNCTIONAL,
        modalities=[],
        description="Fluorescence-based cell sorting and phenotyping",
        typical_duration_days=1.0,
        typical_cost_usd=500.0,
        requires_live_cells=True,
        throughput="very high (millions of cells)",
        outputs=["cell_counts", "sorted_cells"],
    ),
    "mass_cytometry_CyTOF": AssaySpec(
        name="mass_cytometry_CyTOF",
        category=AssayCategory.PROTEOMICS,
        modalities=[],
        description="Mass-tag cytometry for 40+ protein markers per cell",
        typical_duration_days=2.0,
        typical_cost_usd=3000.0,
        requires_live_cells=True,
        throughput="high (100k-1M cells)",
        outputs=["protein_expression_matrix"],
    ),
    "western_blot": AssaySpec(
        name="western_blot",
        category=AssayCategory.PROTEOMICS,
        modalities=[],
        description="Protein detection and semi-quantification by size separation",
        typical_duration_days=2.0,
        typical_cost_usd=200.0,
        outputs=["band_image", "relative_quantification"],
    ),
    "qPCR": AssaySpec(
        name="qPCR",
        category=AssayCategory.FUNCTIONAL,
        modalities=[],
        description="Quantitative PCR for targeted gene expression validation",
        typical_duration_days=1.0,
        typical_cost_usd=100.0,
        throughput="low (target genes)",
        outputs=["ct_values", "fold_change"],
    ),
    "immunofluorescence": AssaySpec(
        name="immunofluorescence",
        category=AssayCategory.IMAGING,
        modalities=[],
        description="Antibody-based fluorescence imaging of proteins in situ",
        typical_duration_days=2.0,
        typical_cost_usd=500.0,
        outputs=["fluorescence_image"],
    ),
    "elisa": AssaySpec(
        name="elisa",
        category=AssayCategory.PROTEOMICS,
        modalities=[],
        description="Enzyme-linked immunosorbent assay for secreted protein quantification",
        typical_duration_days=1.0,
        typical_cost_usd=300.0,
        throughput="medium (96-384 well)",
        outputs=["protein_concentration"],
    ),
    "cell_viability_assay": AssaySpec(
        name="cell_viability_assay",
        category=AssayCategory.FUNCTIONAL,
        modalities=[],
        description="MTT/CellTiter-Glo viability and proliferation measurement",
        typical_duration_days=1.0,
        typical_cost_usd=200.0,
        requires_live_cells=True,
        throughput="high (96-384 well)",
        outputs=["viability_scores"],
    ),
}


# ── Registry helper functions ──────────────────────────────────────────────


def tools_for_modality(modality: str) -> List[ToolSpec]:
    """Return all registered tools compatible with a given modality.

    Matching is an exact membership test against each ``ToolSpec.modalities``
    list (``ModalitySpec.compatible_tools`` is not consulted).

    Args:
        modality: Modality label such as ``"scRNA-seq"``.

    Returns:
        Matching specs in ``TOOL_REGISTRY`` insertion order; an empty list
        when no tool lists the modality.
    """
    return [t for t in TOOL_REGISTRY.values() if modality in t.modalities]


def assays_for_modality(modality: str) -> List[AssaySpec]:
    """Return all registered assays that produce a given modality.

    Matching is an exact membership test against each
    ``AssaySpec.modalities`` list.

    Args:
        modality: Modality label such as ``"scRNA-seq"``.

    Returns:
        Matching specs in ``ASSAY_REGISTRY`` insertion order; an empty list
        when no assay lists the modality.
    """
    return [a for a in ASSAY_REGISTRY.values() if modality in a.modalities]


def tools_by_category(category: ToolCategory) -> List[ToolSpec]:
    """Return all registered tools in a given category.

    Args:
        category: Category to compare against ``ToolSpec.category``.

    Returns:
        Matching specs in ``TOOL_REGISTRY`` insertion order; an empty list
        when the category has no registered tool.
    """
    return [t for t in TOOL_REGISTRY.values() if t.category == category]


# ── Sub-agents ─────────────────────────────────────────────────────────────


# Identifiers accepted by ExperimentAction.invoked_subagent.
class SubagentType(str, Enum):
    WET_LAB_PLANNER = "wet_lab_planner"
    COMPUTATIONAL_ANALYST = "computational_analyst"
    OMICS_QC_AGENT = "omics_qc_agent"
    CAUSAL_REASONING_AGENT = "causal_reasoning_agent"
    BUDGET_SCHEDULER = "budget_scheduler"
    BIOLOGICAL_RULE_CHECKER = "biological_rule_checker"
    TOOL_EXECUTOR = "tool_executor"
    RETROSPECTIVE_CRITIC = "retrospective_critic"
    REPORT_SYNTHESIZER = "report_synthesizer"


# ── Action schema ───────────────────────────────────────────────────────────


class ExperimentAction(Action):
    """Structured, compositional action for one experiment / analysis step.

    Hybrid representation: discrete *action_type* plus typed arguments,
    optional sub-agent / tool invocation, and calibration fields.
    """

    # The only required field (`...` default); everything else is optional.
    action_type: ActionType = Field(
        ...,
        description=(
            "Discrete simulator step type. The environment enforces scientific "
            "prerequisites between steps, so actions should follow a valid "
            "pipeline order."
        ),
    )
    input_targets: List[str] = Field(
        default_factory=list,
        description=(
            "Optional references to prior samples, outputs, or artifacts that "
            "this step consumes."
        ),
    )
    # Free-form string: not validated against TOOL_REGISTRY / ASSAY_REGISTRY
    # by this model.
    method: Optional[str] = Field(
        None,
        description=(
            "Optional named tool or protocol (for example 'Seurat' or "
            "'CellRanger'). Prefer methods compatible with the current "
            "modality and available tool list because tool choice can change "
            "runtime, cost, and scientific fit."
        ),
    )
    parameters: Dict[str, Any] = Field(
        default_factory=dict,
        description=(
            "Action-specific settings such as comparison labels, perturbation "
            "targets, or analysis options. Use only parameters that materially "
            "change the scientific step."
        ),
    )
    # Plain string rather than the OutputType enum defined below.
    expected_output_type: Optional[str] = Field(
        None,
        description=(
            "Optional expected artifact or summary that should result from the "
            "step, such as a count matrix, QC report, DE table, or validation "
            "result."
        ),
    )
    justification: Optional[str] = Field(
        None,
        description=(
            "Short scientific rationale explaining why this is the right next "
            "step in the current environment state."
        ),
    )
    invoked_subagent: Optional[SubagentType] = Field(
        None, description="Sub-agent to delegate to, if any"
    )
    tool_call_spec: Optional[Dict[str, Any]] = Field(
        None,
        description=(
            "Optional structured tool invocation payload when the action needs "
            "a more explicit tool execution plan."
        ),
    )
    # Constrained to the closed interval [0.0, 1.0]; defaults to 0.5.
    confidence: float = Field(
        0.5, ge=0.0, le=1.0, description="Agent confidence in this step"
    )


# ── Intermediate outputs ────────────────────────────────────────────────────


# Kinds of result an IntermediateOutput can carry.
class OutputType(str, Enum):
    QC_METRICS = "qc_metrics"
    COUNT_MATRIX_SUMMARY = "count_matrix_summary"
    EMBEDDING_SUMMARY = "embedding_summary"
    CLUSTER_RESULT = "cluster_result"
    DE_RESULT = "de_result"
    PATHWAY_RESULT = "pathway_result"
    TRAJECTORY_RESULT = "trajectory_result"
    VALIDATION_RESULT = "validation_result"
    NETWORK_RESULT = "network_result"
    SAMPLE_COLLECTION_RESULT = "sample_collection_result"
    LIBRARY_PREP_RESULT = "library_prep_result"
    SEQUENCING_RESULT = "sequencing_result"
    PERTURBATION_RESULT = "perturbation_result"
    CULTURE_RESULT = "culture_result"
    COHORT_RESULT = "cohort_result"
    FOLLOWUP_DESIGN = "followup_design"
    MARKER_RESULT = "marker_result"
    FAILURE_REPORT = "failure_report"
    SUBAGENT_REPORT = "subagent_report"
    CONCLUSION = "conclusion"


class IntermediateOutput(BaseModel):
    """A single simulated output from one pipeline step."""

    output_type: OutputType
    step_index: int
    success: bool = True
    # Both score fields are validated to lie in [0.0, 1.0].
    quality_score: float = Field(1.0, ge=0.0, le=1.0)
    summary: str = ""
    data: Dict[str, Any] = Field(default_factory=dict)
    uncertainty: float = Field(0.0, ge=0.0, le=1.0)
    warnings: List[str] = Field(default_factory=list)
    artifacts_available: List[str] = Field(default_factory=list)


# ── Observable state components ─────────────────────────────────────────────


class ResourceUsage(BaseModel):
    budget_used: float = 0.0
    budget_remaining: float =
100_000.0 time_used_days: float = 0.0 time_remaining_days: float = 180.0 samples_consumed: int = 0 compute_hours_used: float = 0.0 class PipelineStepRecord(BaseModel): step_index: int action_type: ActionType method: Optional[str] = None parameters: Dict[str, Any] = Field(default_factory=dict) output_summary: str = "" output_type: OutputType success: bool = True quality_score: float = 1.0 resource_cost: float = 0.0 time_cost_days: float = 0.0 class PaperReference(BaseModel): """Metadata for a literature source used to ground a task.""" title: str citation: Optional[str] = None doi: Optional[str] = None pmid: Optional[str] = None url: Optional[str] = None class ExpectedFinding(BaseModel): """A paper-backed result that the agent should try to recover.""" finding: str category: str = "claim" keywords: List[str] = Field(default_factory=list) class TaskSpec(BaseModel): """Specification of the biological problem to solve.""" problem_statement: str = "Unspecified biological problem" modality: str = "scRNA-seq" organism: str = "human" tissue: str = "blood" conditions: List[str] = Field(default_factory=list) available_assays: List[str] = Field( default_factory=lambda: list(ASSAY_REGISTRY.keys()), description=( "Assays that are scientifically compatible with this task's " "modality. These are the relevant assay choices for the episode, " "not an unrestricted catalog." ), ) available_tools: List[str] = Field( default_factory=lambda: list(TOOL_REGISTRY.keys()), description=( "Tools filtered to those compatible with the current task " "modality. The agent should treat this list as the preferred tool " "set for the episode." 
), ) budget_limit: float = 100_000.0 time_limit_days: float = 180.0 prior_observations: List[str] = Field(default_factory=list) success_criteria: List[str] = Field(default_factory=list) dataset_metadata: Dict[str, Any] = Field(default_factory=dict) paper_references: List[PaperReference] = Field(default_factory=list) expected_findings: List[ExpectedFinding] = Field(default_factory=list) class ConclusionClaim(BaseModel): claim: str = "" top_markers: List[str] = Field(default_factory=list) causal_mechanisms: List[str] = Field(default_factory=list) predicted_pathways: Dict[str, float] = Field(default_factory=dict) evidence_steps: List[int] = Field(default_factory=list) confidence: float = Field(0.5, ge=0.0, le=1.0) claim_type: str = "correlational" supporting_data: Dict[str, Any] = Field(default_factory=dict) # ── Observation schema ────────────────────────────────────────────────────── class ExperimentObservation(Observation): """Full observable state returned to the agent at each timestep. Deliberately excludes hidden latent biological truth, hidden failure conditions, and ground-truth mechanisms. """ task: TaskSpec = Field(default_factory=TaskSpec) step_index: int = 0 pipeline_history: List[PipelineStepRecord] = Field(default_factory=list) available_assays: List[str] = Field( default_factory=list, description=( "Episode-specific assay choices already filtered to the current " "modality and task context." ), ) available_tools: List[str] = Field( default_factory=list, description=( "Episode-specific compatible tools. These are the methods the " "agent should prefer instead of inventing incompatible tools." ), ) resource_usage: ResourceUsage = Field( default_factory=ResourceUsage, description=( "Running budget, time, and compute usage after previous actions." 
), ) latest_output: Optional[IntermediateOutput] = None all_outputs: List[IntermediateOutput] = Field(default_factory=list) discovered_markers: List[str] = Field(default_factory=list) candidate_mechanisms: List[str] = Field(default_factory=list) uncertainty_summary: Dict[str, float] = Field(default_factory=dict) subagent_outputs: List[Dict[str, Any]] = Field(default_factory=list) conclusions: List[ConclusionClaim] = Field(default_factory=list) rule_violations: List[str] = Field(default_factory=list) step_reward_breakdown: Dict[str, float] = Field(default_factory=dict) AGENT_ACTION_GUIDANCE: Dict[ActionType, str] = { ActionType.COLLECT_SAMPLE: ( "Wet-lab entry point. One successful collection usually provides enough " "material to continue unless the output shows poor yield or quality." ), ActionType.SELECT_COHORT: ( "Use when subject stratification is part of the scientific question " "before downstream experimental work." ), ActionType.PREPARE_LIBRARY: ( "Requires collected samples and converts biological material into " "sequence-ready libraries." ), ActionType.CULTURE_CELLS: ( "Requires collected samples and adds substantial time; use only when " "live-cell expansion or later perturbation is needed." ), ActionType.PERTURB_GENE: ( "Requires samples. Use for causal tests, not as a default discovery " "step." ), ActionType.PERTURB_COMPOUND: ( "Requires samples. Best for mechanistic follow-up or treatment " "response questions." ), ActionType.SEQUENCE_CELLS: ( "Requires prepared libraries and produces the raw sequencing-derived " "artifacts used by downstream QC and analysis." ), ActionType.RUN_QC: ( "Requires sequencing and returns summarized quality metrics such as " "doublets, mitochondrial fraction, and ambient RNA." ), ActionType.FILTER_DATA: ( "Requires QC and removes poor-quality cells, changing downstream cell " "counts and data retention." 
), ActionType.NORMALIZE_DATA: ( "Requires filtered data and unlocks clustering, differential " "expression, trajectory, and network analyses." ), ActionType.INTEGRATE_BATCHES: ( "Requires normalized data. Use when batch effects are likely to " "confound interpretation; it is not always necessary." ), ActionType.CLUSTER_CELLS: ( "Requires normalized data and identifies cell populations or states " "for downstream interpretation." ), ActionType.DIFFERENTIAL_EXPRESSION: ( "Requires normalized data and is the main route to candidate genes " "for pathway analysis and marker selection." ), ActionType.TRAJECTORY_ANALYSIS: ( "Requires normalized data and is most useful when lineage progression " "or pseudotime is central to the task." ), ActionType.PATHWAY_ENRICHMENT: ( "Requires differential expression. Results are less reliable without a " "strong DE gene list." ), ActionType.REGULATORY_NETWORK_INFERENCE: ( "Requires normalized data and is most helpful once cell states or " "trajectories are already characterized." ), ActionType.MARKER_SELECTION: ( "Requires differential expression and turns candidate genes into a " "short list for validation." ), ActionType.VALIDATE_MARKER: ( "Requires discovered markers and is an expensive wet-lab confirmation " "step that should follow strong computational evidence." ), ActionType.DESIGN_FOLLOWUP: ( "Use to propose targeted next experiments once remaining uncertainty " "is clear." ), ActionType.REQUEST_SUBAGENT_REVIEW: ( "Use for critique or planning support, not as a substitute for " "missing experimental evidence." ), ActionType.SYNTHESIZE_CONCLUSION: ( "Use once the evidence is sufficient. Do not spend budget on redundant " "steps just because more actions are possible." ), } AGENT_ENVIRONMENT_RULES: List[str] = [ ( "Each successful action already returns summarized scientific evidence, " "so repeated sampling or repeated analysis is not the default." 
), ( "Repeat a step only when the task demands it or when prior outputs show " "poor quality, insufficient yield, unresolved batch effects, or another " "clear failure mode." ), ( "The available tool and assay lists are already filtered to the current " "task modality, so prefer them over inventing incompatible methods." ), ( "Hard scientific prerequisites are enforced by the environment, so " "invalid pipeline orderings will be blocked." ), ] _TOOL_CATEGORY_AGENT_NOTES: Dict[ToolCategory, str] = { ToolCategory.ALIGNMENT: ( "Best immediately after sequencing to turn FASTQ-like inputs into " "count-style matrices for downstream analysis." ), ToolCategory.PREPROCESSING: ( "Useful for general single-cell data handling before specialized " "downstream analyses." ), ToolCategory.NORMALIZATION: ( "Applies after filtering to produce normalized matrices for downstream " "modeling." ), ToolCategory.DIMENSIONALITY_REDUCTION: ( "Builds latent embeddings that support clustering or trajectory work." ), ToolCategory.CLUSTERING: ( "Best once data are normalized and the goal is to resolve cell states " "or populations." ), ToolCategory.DIFFERENTIAL_EXPRESSION: ( "Tests contrasts and produces ranked genes for biological " "interpretation." ), ToolCategory.TRAJECTORY: ( "Useful when the task asks about developmental progression, state " "transitions, or pseudotime." ), ToolCategory.GENE_REGULATORY_NETWORK: ( "Most useful after normalized data and some cell-state structure are " "already established." ), ToolCategory.GENE_SET_ANALYSIS: ( "Best after differential expression to interpret gene lists at the " "pathway level." ), ToolCategory.BATCH_CORRECTION: ( "Use when batch effects would confound interpretation; unnecessary use " "adds extra steps." ), ToolCategory.MULTIMODAL_INTEGRATION: ( "Useful only when combining modalities or batches is part of the " "scientific question." 
), ToolCategory.QUALITY_CONTROL: ( "Helps identify low-quality cells or technical artifacts before " "filtering." ), ToolCategory.CELL_TYPE_ANNOTATION: ( "Best after clustering when assigning biological identities to groups." ), ToolCategory.PERTURBATION_ANALYSIS: ( "Use when perturbations were actually applied and the goal is to model " "their transcriptional effects." ), ToolCategory.SPATIAL: ( "Only useful when the modality includes spatial coordinates or tissue " "context." ), } def _format_currency(value: float) -> str: return f"${value:,.0f}" def _format_runtime_hours(hours: float) -> str: if hours < 1.0: return f"{int(round(hours * 60))}m" if float(hours).is_integer(): return f"{int(hours)}h" return f"{hours:.1f}h" def describe_tool_for_agent(tool_name: str) -> str: """Return a compact environment-aware tool description for prompts.""" tool = TOOL_REGISTRY.get(tool_name) if tool is None: return tool_name parts = [f"{tool.name}: {tool.description}."] if tool.input_types or tool.output_types: inputs = ", ".join(tool.input_types) or "upstream artifacts" outputs = ", ".join(tool.output_types) or "analysis artifacts" parts.append(f"Consumes {inputs}; yields {outputs}.") category_note = _TOOL_CATEGORY_AGENT_NOTES.get(tool.category) if category_note: parts.append(category_note) resource_bits: List[str] = [] if tool.typical_cost_usd > 0: resource_bits.append(_format_currency(tool.typical_cost_usd)) if tool.typical_runtime_hours > 0: resource_bits.append(_format_runtime_hours(tool.typical_runtime_hours)) if tool.requires_gpu: resource_bits.append("GPU") if resource_bits: parts.append(f"Typical resources: {', '.join(resource_bits)}.") return " ".join(parts) def describe_assay_for_agent(assay_name: str) -> str: """Return a compact environment-aware assay description for prompts.""" assay = ASSAY_REGISTRY.get(assay_name) if assay is None: return assay_name parts = [f"{assay.name}: {assay.description}."] if assay.outputs: parts.append(f"Produces {', 
'.join(assay.outputs)}.") requirements: List[str] = [] if assay.requires_live_cells: requirements.append("live cells") if assay.requires_fresh_tissue: requirements.append("fresh tissue") if requirements: parts.append(f"Requires {' and '.join(requirements)}.") parts.append( "Typical resources: " f"{_format_currency(assay.typical_cost_usd)}, " f"{assay.typical_duration_days:.1f}d." ) return " ".join(parts) def build_agent_system_prompt() -> str: """Build the shared agent system prompt for training and inference.""" lines = [ "You are an expert biologist planning a single-cell experiment pipeline.", "", "At each turn you see the experiment state and must pick the next scientifically justified step.", "", "Environment-specific reasoning rules:", ] lines.extend(f" - {rule}" for rule in AGENT_ENVIRONMENT_RULES) lines.append("") lines.append("Action guidance:") lines.extend( f" - {action_type.value}: {AGENT_ACTION_GUIDANCE[action_type]}" for action_type in ActionType ) lines.extend([ "", "Respond with ONLY valid JSON, nothing else:", '{"action_type": "...", "method": null, "parameters": {}, "justification": "...", "confidence": 0.8}', "", "For synthesize_conclusion, use structured claims:", '{"action_type": "synthesize_conclusion", "parameters": {"claims": [{"top_markers": ["GENE1", "GENE2"], "causal_mechanisms": ["mechanism description"], "predicted_pathways": {"pathway_name": 0.8}, "confidence": 0.8, "claim_type": "causal", "claim": "optional free text"}]}, "justification": "...", "confidence": 0.8}', ]) return "\n".join(lines) def build_agent_observation_context( obs: ExperimentObservation, *, max_tools: int = 6, max_assays: int = 3, ) -> str: """Summarize modality-specific tool and assay context for the agent.""" sections: List[str] = [] modality_spec = MODALITY_REGISTRY.get(obs.task.modality) if modality_spec is not None: sections.append( "Modality context: " f"{modality_spec.name} measures {modality_spec.measurement} at " f"{modality_spec.resolution} resolution; 
typical scale " f"{modality_spec.typical_cells}." ) else: sections.append(f"Modality context: {obs.task.modality}.") tool_names = list(dict.fromkeys(obs.available_tools or obs.task.available_tools)) if tool_names: sections.append("Available tools (already filtered to this modality):") for tool_name in tool_names[:max_tools]: sections.append(f" - {describe_tool_for_agent(tool_name)}") if len(tool_names) > max_tools: remainder = ", ".join(tool_names[max_tools:max_tools + 6]) sections.append( " - Additional compatible tools not shown in full: " f"{remainder}" ) assay_names = list(dict.fromkeys(obs.available_assays or obs.task.available_assays)) if assay_names: sections.append("Available assays:") for assay_name in assay_names[:max_assays]: sections.append(f" - {describe_assay_for_agent(assay_name)}") if len(assay_names) > max_assays: remainder = ", ".join(assay_names[max_assays:max_assays + 4]) sections.append( " - Additional compatible assays not shown in full: " f"{remainder}" ) return "\n".join(sections)