Spaces:
Running on CPU Upgrade
Running on CPU Upgrade
| """ | |
| Data models for the Bio-Experiment Planning RL Environment. | |
| Defines the POMDP action and observation contracts for a scientific agent | |
| that constructs biological experiment pipelines step-by-step. | |
| """ | |
| from __future__ import annotations | |
| from enum import Enum | |
| from typing import Any, Dict, List, Optional | |
| from pydantic import BaseModel, Field | |
| from openenv.core.env_server.types import Action, Observation | |
# ── Action vocabulary ───────────────────────────────────────────────────────
class ActionType(str, Enum):
    """Enumerates every step the agent may add to an experiment pipeline.

    Each member is also a ``str`` whose value is the identifier on the
    right-hand side. The module-level sets ``WET_LAB_ACTIONS``,
    ``COMPUTATIONAL_ACTIONS`` and ``META_ACTIONS`` split these members into
    three disjoint groups that together cover the whole enum.
    """

    COLLECT_SAMPLE = "collect_sample"
    SELECT_COHORT = "select_cohort"
    PREPARE_LIBRARY = "prepare_library"
    CULTURE_CELLS = "culture_cells"
    PERTURB_GENE = "perturb_gene"
    PERTURB_COMPOUND = "perturb_compound"
    SEQUENCE_CELLS = "sequence_cells"
    RUN_QC = "run_qc"
    FILTER_DATA = "filter_data"
    NORMALIZE_DATA = "normalize_data"
    INTEGRATE_BATCHES = "integrate_batches"
    CLUSTER_CELLS = "cluster_cells"
    DIFFERENTIAL_EXPRESSION = "differential_expression"
    TRAJECTORY_ANALYSIS = "trajectory_analysis"
    PATHWAY_ENRICHMENT = "pathway_enrichment"
    REGULATORY_NETWORK_INFERENCE = "regulatory_network_inference"
    MARKER_SELECTION = "marker_selection"
    # Listed among the analysis steps here, but grouped under WET_LAB_ACTIONS.
    VALIDATE_MARKER = "validate_marker"
    # NOTE: the member name is shorter than its wire value.
    DESIGN_FOLLOWUP = "design_followup_experiment"
    REQUEST_SUBAGENT_REVIEW = "request_subagent_review"
    SYNTHESIZE_CONCLUSION = "synthesize_conclusion"
# Actions classified as wet-lab work (includes marker validation).
WET_LAB_ACTIONS = frozenset((
    ActionType.COLLECT_SAMPLE,
    ActionType.SELECT_COHORT,
    ActionType.PREPARE_LIBRARY,
    ActionType.CULTURE_CELLS,
    ActionType.PERTURB_GENE,
    ActionType.PERTURB_COMPOUND,
    ActionType.SEQUENCE_CELLS,
    ActionType.VALIDATE_MARKER,
))

# Actions classified as computational analysis steps.
COMPUTATIONAL_ACTIONS = frozenset((
    ActionType.RUN_QC,
    ActionType.FILTER_DATA,
    ActionType.NORMALIZE_DATA,
    ActionType.INTEGRATE_BATCHES,
    ActionType.CLUSTER_CELLS,
    ActionType.DIFFERENTIAL_EXPRESSION,
    ActionType.TRAJECTORY_ANALYSIS,
    ActionType.PATHWAY_ENRICHMENT,
    ActionType.REGULATORY_NETWORK_INFERENCE,
    ActionType.MARKER_SELECTION,
))

# Actions about planning, review and wrap-up rather than data generation
# or analysis.
META_ACTIONS = frozenset((
    ActionType.DESIGN_FOLLOWUP,
    ActionType.REQUEST_SUBAGENT_REVIEW,
    ActionType.SYNTHESIZE_CONCLUSION,
))
# ── Tool, Assay & Modality Registries ──────────────────────────────────────
class ToolCategory(str, Enum):
    """Functional category attached to a tool through ``ToolSpec.category``.

    Members are ``str`` instances, so each compares equal to its snake_case
    value.
    """

    ALIGNMENT = "alignment"
    PREPROCESSING = "preprocessing"
    NORMALIZATION = "normalization"
    DIMENSIONALITY_REDUCTION = "dimensionality_reduction"
    CLUSTERING = "clustering"
    DIFFERENTIAL_EXPRESSION = "differential_expression"
    TRAJECTORY = "trajectory"
    GENE_REGULATORY_NETWORK = "gene_regulatory_network"
    CELL_COMMUNICATION = "cell_communication"
    SPATIAL = "spatial"
    MULTIMODAL_INTEGRATION = "multimodal_integration"
    GENE_SET_ANALYSIS = "gene_set_analysis"
    VARIANT_CALLING = "variant_calling"
    PEAK_CALLING = "peak_calling"
    IMPUTATION = "imputation"
    BATCH_CORRECTION = "batch_correction"
    CELL_TYPE_ANNOTATION = "cell_type_annotation"
    SIMULATION = "simulation"
    VISUALIZATION = "visualization"
    QUALITY_CONTROL = "quality_control"
    PERTURBATION_ANALYSIS = "perturbation_analysis"
class ToolSpec(BaseModel):
    """Registry entry describing a bioinformatics tool."""

    # Tool name; TOOL_REGISTRY uses this same string as the dictionary key.
    name: str
    # Functional category of the tool.
    category: ToolCategory
    # Modality names the tool applies to. Kept as plain strings; the registry
    # entries use the same spellings as the Modality enum values.
    modalities: List[str] = Field(default_factory=list)
    # One-line human-readable summary.
    description: str = ""
    # Labels of the artifacts the tool consumes (e.g. "fastq", "count_matrix").
    input_types: List[str] = Field(default_factory=list)
    # Labels of the artifacts the tool produces.
    output_types: List[str] = Field(default_factory=list)
    # Typical runtime in hours; 0.1 unless an entry overrides it.
    typical_runtime_hours: float = 0.1
    # Typical cost in US dollars; 0.0 unless an entry overrides it.
    typical_cost_usd: float = 0.0
    # True when the tool needs a GPU.
    requires_gpu: bool = False
    # False for tools flagged as not open source.
    open_source: bool = True
# Catalogue of known bioinformatics tools, keyed by tool name.
# Keys are derived from each entry's own ``name`` so the two cannot drift
# apart; insertion order follows the listing below.
TOOL_REGISTRY: Dict[str, ToolSpec] = {
    spec.name: spec
    for spec in (
        # ── Alignment & quantification ──
        ToolSpec(
            name="CellRanger",
            category=ToolCategory.ALIGNMENT,
            modalities=["scRNA-seq", "scATAC-seq", "CITE-seq", "scMultiome"],
            description="10x Genomics pipeline for alignment, barcode demux, and counting",
            input_types=["fastq"],
            output_types=["count_matrix", "bam"],
            typical_runtime_hours=4.0,
            open_source=False,
        ),
        ToolSpec(
            name="STARsolo",
            category=ToolCategory.ALIGNMENT,
            modalities=["scRNA-seq", "scATAC-seq"],
            description="Drop-seq / 10x-compatible aligner built into STAR",
            input_types=["fastq"],
            output_types=["count_matrix", "bam"],
            typical_runtime_hours=3.0,
        ),
        ToolSpec(
            name="kallisto_bustools",
            category=ToolCategory.ALIGNMENT,
            modalities=["scRNA-seq"],
            description="Pseudoalignment-based lightweight quantification",
            input_types=["fastq"],
            output_types=["count_matrix"],
            typical_runtime_hours=1.0,
        ),
        ToolSpec(
            name="Salmon_alevin",
            category=ToolCategory.ALIGNMENT,
            modalities=["scRNA-seq"],
            description="Quasi-mapping quantification for single-cell RNA-seq",
            input_types=["fastq"],
            output_types=["count_matrix"],
            typical_runtime_hours=1.5,
        ),
        ToolSpec(
            name="spaceranger",
            category=ToolCategory.ALIGNMENT,
            modalities=["spatial_transcriptomics"],
            description="10x Visium spatial alignment and quantification",
            input_types=["fastq", "image"],
            output_types=["count_matrix", "spatial_coords"],
            typical_runtime_hours=3.0,
            open_source=False,
        ),
        # ── Preprocessing / analysis frameworks ──
        ToolSpec(
            name="Scanpy",
            category=ToolCategory.PREPROCESSING,
            modalities=["scRNA-seq", "scATAC-seq", "spatial_transcriptomics"],
            description="Python single-cell analysis framework",
            input_types=["count_matrix", "h5ad"],
            output_types=["h5ad", "embedding", "cluster_result"],
            typical_runtime_hours=0.5,
        ),
        ToolSpec(
            name="Seurat",
            category=ToolCategory.PREPROCESSING,
            modalities=["scRNA-seq", "CITE-seq", "spatial_transcriptomics", "scATAC-seq"],
            description="R single-cell analysis toolkit with multimodal support",
            input_types=["count_matrix", "h5seurat"],
            output_types=["h5seurat", "embedding", "cluster_result"],
            typical_runtime_hours=0.5,
        ),
        ToolSpec(
            name="Bioconductor_SingleCellExperiment",
            category=ToolCategory.PREPROCESSING,
            modalities=["scRNA-seq"],
            description="R/Bioconductor framework for single-cell experiments",
            input_types=["count_matrix"],
            output_types=["sce_object"],
            typical_runtime_hours=0.3,
        ),
        # ── Normalization ──
        ToolSpec(
            name="scran",
            category=ToolCategory.NORMALIZATION,
            modalities=["scRNA-seq"],
            description="Pool-based size-factor normalization",
            input_types=["count_matrix"],
            output_types=["normalized_matrix"],
        ),
        ToolSpec(
            name="sctransform",
            category=ToolCategory.NORMALIZATION,
            modalities=["scRNA-seq"],
            description="Variance-stabilizing transformation via regularized NB regression",
            input_types=["count_matrix"],
            output_types=["normalized_matrix"],
        ),
        # ── Dimensionality reduction ──
        ToolSpec(
            name="scVI",
            category=ToolCategory.DIMENSIONALITY_REDUCTION,
            modalities=["scRNA-seq", "CITE-seq", "scATAC-seq"],
            description="Deep generative model for scRNA-seq (variational inference)",
            input_types=["count_matrix"],
            output_types=["latent_embedding"],
            requires_gpu=True,
        ),
        ToolSpec(
            name="UMAP",
            category=ToolCategory.DIMENSIONALITY_REDUCTION,
            modalities=["scRNA-seq", "scATAC-seq", "CITE-seq", "spatial_transcriptomics"],
            description="Uniform manifold approximation for 2D/3D visualization",
            input_types=["pca_embedding", "latent_embedding"],
            output_types=["2d_embedding"],
        ),
        # ── Clustering ──
        ToolSpec(
            name="Leiden",
            category=ToolCategory.CLUSTERING,
            modalities=["scRNA-seq", "scATAC-seq", "CITE-seq"],
            description="Community detection via the Leiden algorithm",
            input_types=["knn_graph"],
            output_types=["cluster_result"],
        ),
        ToolSpec(
            name="Louvain",
            category=ToolCategory.CLUSTERING,
            modalities=["scRNA-seq", "scATAC-seq"],
            description="Community detection via Louvain modularity optimization",
            input_types=["knn_graph"],
            output_types=["cluster_result"],
        ),
        # ── Differential expression ──
        ToolSpec(
            name="DESeq2",
            category=ToolCategory.DIFFERENTIAL_EXPRESSION,
            modalities=["bulk_rna_seq", "scRNA-seq"],
            description="Negative binomial GLM-based differential expression",
            input_types=["count_matrix"],
            output_types=["de_result"],
        ),
        ToolSpec(
            name="MAST",
            category=ToolCategory.DIFFERENTIAL_EXPRESSION,
            modalities=["scRNA-seq"],
            description="Two-part hurdle model for scRNA-seq DE testing",
            input_types=["count_matrix"],
            output_types=["de_result"],
        ),
        ToolSpec(
            name="edgeR",
            category=ToolCategory.DIFFERENTIAL_EXPRESSION,
            modalities=["bulk_rna_seq", "scRNA-seq"],
            description="Empirical Bayes quasi-likelihood DE testing",
            input_types=["count_matrix"],
            output_types=["de_result"],
        ),
        ToolSpec(
            name="Wilcoxon",
            category=ToolCategory.DIFFERENTIAL_EXPRESSION,
            modalities=["scRNA-seq"],
            description="Rank-sum test for marker gene detection",
            input_types=["count_matrix"],
            output_types=["de_result"],
        ),
        # ── Trajectory & RNA velocity ──
        ToolSpec(
            name="Monocle3",
            category=ToolCategory.TRAJECTORY,
            modalities=["scRNA-seq"],
            description="Reversed graph embedding for pseudotime trajectories",
            input_types=["count_matrix", "embedding"],
            output_types=["trajectory_result", "pseudotime"],
        ),
        ToolSpec(
            name="scVelo",
            category=ToolCategory.TRAJECTORY,
            modalities=["scRNA-seq"],
            description="RNA velocity estimation via spliced/unspliced dynamics",
            input_types=["count_matrix"],
            output_types=["velocity_result"],
        ),
        ToolSpec(
            name="CellRank",
            category=ToolCategory.TRAJECTORY,
            modalities=["scRNA-seq"],
            description="Fate probability estimation combining velocity and transcriptomics",
            input_types=["velocity_result", "count_matrix"],
            output_types=["fate_probabilities"],
        ),
        ToolSpec(
            name="Slingshot",
            category=ToolCategory.TRAJECTORY,
            modalities=["scRNA-seq"],
            description="Minimum spanning tree-based trajectory inference",
            input_types=["embedding", "cluster_result"],
            output_types=["trajectory_result", "pseudotime"],
        ),
        ToolSpec(
            name="PAGA",
            category=ToolCategory.TRAJECTORY,
            modalities=["scRNA-seq"],
            description="Partition-based graph abstraction for topology estimation",
            input_types=["knn_graph", "cluster_result"],
            output_types=["trajectory_result"],
        ),
        # ── Gene regulatory networks ──
        ToolSpec(
            name="SCENIC",
            category=ToolCategory.GENE_REGULATORY_NETWORK,
            modalities=["scRNA-seq"],
            description="Single-cell regulatory network inference and clustering",
            input_types=["count_matrix"],
            output_types=["regulon_result", "network_result"],
            typical_runtime_hours=6.0,
        ),
        ToolSpec(
            name="CellOracle",
            category=ToolCategory.GENE_REGULATORY_NETWORK,
            modalities=["scRNA-seq", "scATAC-seq", "scMultiome"],
            description="GRN-based in-silico perturbation prediction",
            input_types=["count_matrix", "peak_matrix"],
            output_types=["network_result", "perturbation_prediction"],
            typical_runtime_hours=4.0,
        ),
        # ── Cell-cell communication ──
        ToolSpec(
            name="CellChat",
            category=ToolCategory.CELL_COMMUNICATION,
            modalities=["scRNA-seq", "spatial_transcriptomics"],
            description="Ligand-receptor interaction inference with communication patterns",
            input_types=["count_matrix", "cluster_result"],
            output_types=["communication_result"],
        ),
        ToolSpec(
            name="NicheNet",
            category=ToolCategory.CELL_COMMUNICATION,
            modalities=["scRNA-seq"],
            description="Ligand-target link prediction using prior knowledge",
            input_types=["count_matrix", "de_result"],
            output_types=["communication_result"],
        ),
        ToolSpec(
            name="LIANA",
            category=ToolCategory.CELL_COMMUNICATION,
            modalities=["scRNA-seq", "spatial_transcriptomics"],
            description="Framework unifying multiple ligand-receptor methods",
            input_types=["count_matrix", "cluster_result"],
            output_types=["communication_result"],
        ),
        # ── Spatial analysis ──
        ToolSpec(
            name="squidpy",
            category=ToolCategory.SPATIAL,
            modalities=["spatial_transcriptomics"],
            description="Spatial omics analysis (neighborhood, co-occurrence, image features)",
            input_types=["count_matrix", "spatial_coords"],
            output_types=["spatial_result"],
        ),
        ToolSpec(
            name="cell2location",
            category=ToolCategory.SPATIAL,
            modalities=["spatial_transcriptomics"],
            description="Spatial deconvolution mapping cell types to tissue locations",
            input_types=["count_matrix", "spatial_coords", "reference_h5ad"],
            output_types=["deconvolution_result"],
            requires_gpu=True,
        ),
        ToolSpec(
            name="BANKSY",
            category=ToolCategory.SPATIAL,
            modalities=["spatial_transcriptomics"],
            description="Spatially-aware clustering combining cell and neighbor features",
            input_types=["count_matrix", "spatial_coords"],
            output_types=["cluster_result"],
        ),
        # ── Batch correction ──
        ToolSpec(
            name="Harmony",
            category=ToolCategory.BATCH_CORRECTION,
            modalities=["scRNA-seq", "scATAC-seq", "CITE-seq"],
            description="Fast iterative batch correction on PCA embeddings",
            input_types=["pca_embedding"],
            output_types=["corrected_embedding"],
        ),
        ToolSpec(
            name="scanorama",
            category=ToolCategory.BATCH_CORRECTION,
            modalities=["scRNA-seq"],
            description="Panoramic stitching of scRNA-seq batches",
            input_types=["count_matrix"],
            output_types=["corrected_embedding", "corrected_matrix"],
        ),
        ToolSpec(
            name="BBKNN",
            category=ToolCategory.BATCH_CORRECTION,
            modalities=["scRNA-seq"],
            description="Batch-balanced KNN graph construction",
            input_types=["pca_embedding"],
            output_types=["knn_graph"],
        ),
        # ── Multimodal integration ──
        ToolSpec(
            name="WNN",
            category=ToolCategory.MULTIMODAL_INTEGRATION,
            modalities=["CITE-seq", "scMultiome"],
            description="Weighted nearest neighbors for multimodal integration (Seurat v4+)",
            input_types=["rna_embedding", "protein_embedding"],
            output_types=["multimodal_embedding"],
        ),
        ToolSpec(
            name="MOFA+",
            category=ToolCategory.MULTIMODAL_INTEGRATION,
            modalities=["scMultiome", "CITE-seq"],
            description="Multi-omics factor analysis for unsupervised integration",
            input_types=["count_matrix", "peak_matrix"],
            output_types=["factor_result"],
        ),
        # ── Chromatin accessibility ──
        ToolSpec(
            name="ArchR",
            category=ToolCategory.PREPROCESSING,
            modalities=["scATAC-seq", "scMultiome"],
            description="Full-featured scATAC-seq analysis framework in R",
            input_types=["fragments", "bam"],
            output_types=["peak_matrix", "gene_activity_matrix"],
            typical_runtime_hours=2.0,
        ),
        ToolSpec(
            name="Signac",
            category=ToolCategory.PREPROCESSING,
            modalities=["scATAC-seq", "scMultiome"],
            description="Seurat extension for chromatin accessibility analysis",
            input_types=["fragments", "peak_matrix"],
            output_types=["peak_matrix", "motif_result"],
        ),
        ToolSpec(
            name="chromVAR",
            category=ToolCategory.PEAK_CALLING,
            modalities=["scATAC-seq", "scMultiome"],
            description="TF motif accessibility deviation scoring",
            input_types=["peak_matrix"],
            output_types=["motif_deviation_scores"],
        ),
        # ── Gene set / pathway analysis ──
        ToolSpec(
            name="GSEA",
            category=ToolCategory.GENE_SET_ANALYSIS,
            modalities=["bulk_rna_seq", "scRNA-seq"],
            description="Gene Set Enrichment Analysis (preranked or phenotype-based)",
            input_types=["de_result", "ranked_gene_list"],
            output_types=["pathway_result"],
        ),
        ToolSpec(
            name="clusterProfiler",
            category=ToolCategory.GENE_SET_ANALYSIS,
            modalities=["bulk_rna_seq", "scRNA-seq"],
            description="ORA & GSEA with GO, KEGG, Reactome, and custom gene sets",
            input_types=["de_result", "gene_list"],
            output_types=["pathway_result"],
        ),
        ToolSpec(
            name="decoupleR",
            category=ToolCategory.GENE_SET_ANALYSIS,
            modalities=["scRNA-seq", "bulk_rna_seq", "spatial_transcriptomics"],
            description="Unified framework for functional activity inference (TF, pathway)",
            input_types=["count_matrix", "de_result"],
            output_types=["activity_scores"],
        ),
        # ── Cell type annotation ──
        ToolSpec(
            name="celltypist",
            category=ToolCategory.CELL_TYPE_ANNOTATION,
            modalities=["scRNA-seq"],
            description="Automated cell type classification with pre-trained models",
            input_types=["count_matrix"],
            output_types=["annotation_result"],
        ),
        ToolSpec(
            name="SingleR",
            category=ToolCategory.CELL_TYPE_ANNOTATION,
            modalities=["scRNA-seq"],
            description="Reference-based cell type annotation using correlation",
            input_types=["count_matrix", "reference_dataset"],
            output_types=["annotation_result"],
        ),
        ToolSpec(
            name="scArches",
            category=ToolCategory.CELL_TYPE_ANNOTATION,
            modalities=["scRNA-seq", "scATAC-seq", "CITE-seq"],
            description="Reference mapping and label transfer via deep learning",
            input_types=["count_matrix", "reference_model"],
            output_types=["annotation_result", "latent_embedding"],
            requires_gpu=True,
        ),
        # ── Imputation ──
        ToolSpec(
            name="MAGIC",
            category=ToolCategory.IMPUTATION,
            modalities=["scRNA-seq"],
            description="Markov affinity-based graph imputation of dropout zeros",
            input_types=["count_matrix"],
            output_types=["imputed_matrix"],
        ),
        # ── Perturbation analysis ──
        ToolSpec(
            name="MILO",
            category=ToolCategory.PERTURBATION_ANALYSIS,
            modalities=["scRNA-seq"],
            description="Differential abundance testing on KNN graph neighborhoods",
            input_types=["count_matrix", "knn_graph"],
            output_types=["da_result"],
        ),
        ToolSpec(
            name="Mixscape",
            category=ToolCategory.PERTURBATION_ANALYSIS,
            modalities=["Perturb-seq", "CROP-seq"],
            description="Seurat extension for CRISPR screen perturbation analysis",
            input_types=["count_matrix", "guide_assignments"],
            output_types=["perturbation_result"],
        ),
        ToolSpec(
            name="MIMOSCA",
            category=ToolCategory.PERTURBATION_ANALYSIS,
            modalities=["Perturb-seq", "CROP-seq"],
            description="Multi-input multi-output single-cell analysis for screens",
            input_types=["count_matrix", "guide_assignments"],
            output_types=["perturbation_result"],
        ),
        # ── Quality control ──
        ToolSpec(
            name="scrublet",
            category=ToolCategory.QUALITY_CONTROL,
            modalities=["scRNA-seq"],
            description="Computational doublet detection via synthetic doublets",
            input_types=["count_matrix"],
            output_types=["doublet_scores"],
        ),
        ToolSpec(
            name="DoubletFinder",
            category=ToolCategory.QUALITY_CONTROL,
            modalities=["scRNA-seq"],
            description="Artificial nearest-neighbor doublet detection",
            input_types=["count_matrix"],
            output_types=["doublet_scores"],
        ),
        ToolSpec(
            name="SoupX",
            category=ToolCategory.QUALITY_CONTROL,
            modalities=["scRNA-seq"],
            description="Ambient RNA contamination estimation and removal",
            input_types=["count_matrix", "raw_count_matrix"],
            output_types=["corrected_matrix"],
        ),
        ToolSpec(
            name="DecontX",
            category=ToolCategory.QUALITY_CONTROL,
            modalities=["scRNA-seq"],
            description="Bayesian ambient RNA decontamination",
            input_types=["count_matrix"],
            output_types=["corrected_matrix"],
        ),
        # ── Simulation ──
        ToolSpec(
            name="Splatter",
            category=ToolCategory.SIMULATION,
            modalities=["scRNA-seq"],
            description="Flexible scRNA-seq data simulation framework",
            input_types=["simulation_params"],
            output_types=["simulated_count_matrix"],
        ),
    )
}
class Modality(str, Enum):
    """Assay modality identifiers.

    Members are ``str`` instances. Their values are the spellings used as
    ``MODALITY_REGISTRY`` keys and inside the ``modalities`` lists of the
    tool and assay registry entries.
    """

    SCRNA_SEQ = "scRNA-seq"
    SCATAC_SEQ = "scATAC-seq"
    CITE_SEQ = "CITE-seq"
    SPATIAL_TRANSCRIPTOMICS = "spatial_transcriptomics"
    BULK_RNA_SEQ = "bulk_rna_seq"
    # NOTE: member name and value differ ("scMultiome").
    SCRNA_MULTIOME = "scMultiome"
    PERTURB_SEQ = "Perturb-seq"
    CROP_SEQ = "CROP-seq"
    SMART_SEQ2 = "Smart-seq2"
    SLIDE_SEQ = "Slide-seq"
    MERFISH = "MERFISH"
    SEQFISH = "seqFISH"
    PATCH_SEQ = "Patch-seq"
    SHARE_SEQ = "SHARE-seq"
    SNARE_SEQ = "SNARE-seq"
    SC_HI_C = "scHi-C"
    SCBS_SEQ = "scBS-seq"
    SCNMT_SEQ = "scNMT-seq"
class ModalitySpec(BaseModel):
    """Registry entry for a single-cell or bulk assay modality."""

    # Modality name; MODALITY_REGISTRY uses this same string as its key.
    name: str
    # Enum member corresponding to ``name``.
    modality: Modality
    # What the assay measures, e.g. "mRNA transcripts".
    measurement: str = ""
    # Free-text resolution label; "single-cell" unless an entry overrides it.
    resolution: str = "single-cell"
    # True for entries flagged as multiplexable.
    multiplexable: bool = False
    # Free-text range of cells (or spots/beads) per sample, e.g. "5k-20k".
    typical_cells: str = "1k-20k"
    # Typical per-sample cost in US dollars.
    typical_cost_per_sample_usd: float = 5000.0
    # Tool names; the registry entries use keys of TOOL_REGISTRY.
    compatible_tools: List[str] = Field(default_factory=list)
    # One-line human-readable summary.
    description: str = ""
# Catalogue of assay modalities, keyed by the modality's string value.
# Each key equals both the entry's ``name`` and its ``Modality`` enum value.
# NOTE(review): Modality.SEQFISH, Modality.SHARE_SEQ and Modality.SNARE_SEQ
# have no entry here, so lookups for those values raise KeyError — confirm
# whether that is intentional.
MODALITY_REGISTRY: Dict[str, ModalitySpec] = {
    "scRNA-seq": ModalitySpec(
        name="scRNA-seq",
        modality=Modality.SCRNA_SEQ,
        measurement="mRNA transcripts",
        typical_cells="5k-20k",
        typical_cost_per_sample_usd=5000.0,
        compatible_tools=[
            "CellRanger", "STARsolo", "kallisto_bustools", "Scanpy", "Seurat",
            "scVI", "Leiden", "DESeq2", "MAST", "Monocle3", "scVelo", "SCENIC",
            "CellChat", "GSEA", "celltypist", "scrublet",
        ],
        description="Droplet-based single-cell RNA sequencing (e.g. 10x Chromium)",
    ),
    "scATAC-seq": ModalitySpec(
        name="scATAC-seq",
        modality=Modality.SCATAC_SEQ,
        measurement="open chromatin regions",
        typical_cells="5k-15k",
        typical_cost_per_sample_usd=6000.0,
        compatible_tools=[
            "CellRanger", "ArchR", "Signac", "chromVAR", "Scanpy", "Leiden",
        ],
        description="Single-cell Assay for Transposase-Accessible Chromatin",
    ),
    "CITE-seq": ModalitySpec(
        name="CITE-seq",
        modality=Modality.CITE_SEQ,
        measurement="mRNA + surface proteins (ADT)",
        multiplexable=True,
        typical_cells="5k-20k",
        typical_cost_per_sample_usd=8000.0,
        compatible_tools=[
            "CellRanger", "Seurat", "WNN", "MOFA+", "Scanpy", "Leiden",
        ],
        description="Cellular Indexing of Transcriptomes and Epitopes by Sequencing",
    ),
    "spatial_transcriptomics": ModalitySpec(
        name="spatial_transcriptomics",
        modality=Modality.SPATIAL_TRANSCRIPTOMICS,
        measurement="spatially resolved transcripts",
        # Micro sign restored; the text had been mis-decoded as "55Β΅m".
        resolution="spot (55µm) or subcellular",
        typical_cells="1k-10k spots",
        typical_cost_per_sample_usd=7000.0,
        compatible_tools=[
            "spaceranger", "squidpy", "cell2location", "BANKSY", "Scanpy", "Seurat",
        ],
        description="Spatially resolved transcriptomics (Visium, MERFISH, Slide-seq, etc.)",
    ),
    "bulk_rna_seq": ModalitySpec(
        name="bulk_rna_seq",
        modality=Modality.BULK_RNA_SEQ,
        measurement="aggregate mRNA across cells",
        resolution="bulk",
        typical_cells="N/A",
        typical_cost_per_sample_usd=500.0,
        compatible_tools=["DESeq2", "edgeR", "GSEA", "clusterProfiler"],
        description="Standard bulk RNA sequencing",
    ),
    "scMultiome": ModalitySpec(
        name="scMultiome",
        modality=Modality.SCRNA_MULTIOME,
        measurement="mRNA + open chromatin (joint)",
        typical_cells="5k-15k",
        typical_cost_per_sample_usd=10000.0,
        compatible_tools=[
            "CellRanger", "ArchR", "Signac", "Seurat", "MOFA+", "CellOracle",
        ],
        description="10x Multiome (joint scRNA + scATAC from same cell)",
    ),
    "Perturb-seq": ModalitySpec(
        name="Perturb-seq",
        modality=Modality.PERTURB_SEQ,
        measurement="mRNA + CRISPR guide assignment",
        multiplexable=True,
        typical_cells="10k-100k",
        typical_cost_per_sample_usd=15000.0,
        compatible_tools=[
            "CellRanger", "Scanpy", "Seurat", "Mixscape", "MIMOSCA",
        ],
        description="Pooled CRISPR screens with single-cell RNA readout",
    ),
    "CROP-seq": ModalitySpec(
        name="CROP-seq",
        modality=Modality.CROP_SEQ,
        measurement="mRNA + CRISPR guide assignment",
        multiplexable=True,
        typical_cells="10k-50k",
        typical_cost_per_sample_usd=12000.0,
        compatible_tools=[
            "CellRanger", "Scanpy", "Seurat", "Mixscape", "MIMOSCA",
        ],
        description="CRISPR dropout screen with single-cell RNA readout",
    ),
    "Smart-seq2": ModalitySpec(
        name="Smart-seq2",
        modality=Modality.SMART_SEQ2,
        measurement="full-length mRNA transcripts",
        typical_cells="100-1000",
        typical_cost_per_sample_usd=10000.0,
        compatible_tools=["Scanpy", "Seurat", "DESeq2", "MAST", "Monocle3"],
        description="Plate-based full-length scRNA-seq with high sensitivity",
    ),
    "MERFISH": ModalitySpec(
        name="MERFISH",
        modality=Modality.MERFISH,
        measurement="in situ mRNA (imaging-based)",
        resolution="subcellular",
        typical_cells="10k-1M",
        typical_cost_per_sample_usd=20000.0,
        compatible_tools=["squidpy", "Scanpy", "BANKSY"],
        description="Multiplexed Error-Robust FISH for spatial transcriptomics",
    ),
    "Slide-seq": ModalitySpec(
        name="Slide-seq",
        modality=Modality.SLIDE_SEQ,
        measurement="spatially resolved mRNA (bead array)",
        # Micro sign restored; the text had been mis-decoded as "10Β΅m".
        resolution="10µm",
        typical_cells="10k-50k beads",
        typical_cost_per_sample_usd=8000.0,
        compatible_tools=["squidpy", "cell2location", "Scanpy"],
        description="Near-cellular spatial transcriptomics on bead arrays",
    ),
    "Patch-seq": ModalitySpec(
        name="Patch-seq",
        modality=Modality.PATCH_SEQ,
        measurement="mRNA + electrophysiology + morphology",
        typical_cells="10-500",
        typical_cost_per_sample_usd=50000.0,
        compatible_tools=["Scanpy", "Seurat"],
        description="Combined patch-clamp electrophysiology and scRNA-seq",
    ),
    "scHi-C": ModalitySpec(
        name="scHi-C",
        modality=Modality.SC_HI_C,
        measurement="3D chromatin contacts",
        typical_cells="1k-10k",
        typical_cost_per_sample_usd=15000.0,
        compatible_tools=["Scanpy"],
        description="Single-cell chromosome conformation capture",
    ),
    "scBS-seq": ModalitySpec(
        name="scBS-seq",
        modality=Modality.SCBS_SEQ,
        measurement="DNA methylation (CpG)",
        typical_cells="100-5k",
        typical_cost_per_sample_usd=12000.0,
        compatible_tools=["Scanpy"],
        description="Single-cell bisulfite sequencing for DNA methylation",
    ),
    "scNMT-seq": ModalitySpec(
        name="scNMT-seq",
        modality=Modality.SCNMT_SEQ,
        measurement="nucleosome + methylation + transcription (joint)",
        typical_cells="100-1k",
        typical_cost_per_sample_usd=25000.0,
        compatible_tools=["MOFA+", "Scanpy"],
        description="Joint single-cell nucleosome, methylation, and transcription",
    ),
}
class AssayCategory(str, Enum):
    """Broad category attached to an assay through ``AssaySpec.category``.

    Members are ``str`` instances, so each compares equal to its lowercase
    value.
    """

    SEQUENCING = "sequencing"
    IMAGING = "imaging"
    PERTURBATION = "perturbation"
    FUNCTIONAL = "functional"
    EPIGENOMICS = "epigenomics"
    PROTEOMICS = "proteomics"
    METABOLOMICS = "metabolomics"
class AssaySpec(BaseModel):
    """Registry entry for a laboratory assay or protocol."""

    # Assay name; ASSAY_REGISTRY uses this same string as the dictionary key.
    name: str
    # Broad category of the assay.
    category: AssayCategory
    # Modality names the assay produces data for (plain strings).
    modalities: List[str] = Field(default_factory=list)
    # One-line human-readable summary.
    description: str = ""
    # Typical duration in days; 1.0 unless an entry overrides it.
    typical_duration_days: float = 1.0
    # Typical cost in US dollars; 1000.0 unless an entry overrides it.
    typical_cost_usd: float = 1000.0
    # True when the assay needs live cells.
    requires_live_cells: bool = False
    # True when the assay needs fresh tissue.
    requires_fresh_tissue: bool = False
    # Free-text throughput label, e.g. "high (500-20k cells)".
    throughput: str = "medium"
    # Labels of the artifacts the assay yields (e.g. "fastq", "count_matrix").
    outputs: List[str] = Field(default_factory=list)
| ASSAY_REGISTRY: Dict[str, AssaySpec] = { | |
| "10x_chromium": AssaySpec( | |
| name="10x_chromium", | |
| category=AssayCategory.SEQUENCING, | |
| modalities=["scRNA-seq", "scATAC-seq", "CITE-seq", "scMultiome"], | |
| description="10x Genomics Chromium droplet-based single-cell partitioning", | |
| typical_duration_days=2.0, | |
| typical_cost_usd=5000.0, | |
| requires_live_cells=True, | |
| throughput="high (500-20k cells)", | |
| outputs=["fastq", "count_matrix"], | |
| ), | |
| "smart-seq2": AssaySpec( | |
| name="smart-seq2", | |
| category=AssayCategory.SEQUENCING, | |
| modalities=["Smart-seq2"], | |
| description="Plate-based full-length cDNA scRNA-seq", | |
| typical_duration_days=3.0, | |
| typical_cost_usd=10000.0, | |
| requires_live_cells=True, | |
| throughput="low (96-384 cells)", | |
| outputs=["fastq", "count_matrix"], | |
| ), | |
| "smart-seq3": AssaySpec( | |
| name="smart-seq3", | |
| category=AssayCategory.SEQUENCING, | |
| modalities=["Smart-seq2"], | |
| description="Improved full-length scRNA-seq with UMIs", | |
| typical_duration_days=3.0, | |
| typical_cost_usd=10000.0, | |
| requires_live_cells=True, | |
| throughput="low (96-384 cells)", | |
| outputs=["fastq", "count_matrix"], | |
| ), | |
| "bulk_rna_seq": AssaySpec( | |
| name="bulk_rna_seq", | |
| category=AssayCategory.SEQUENCING, | |
| modalities=["bulk_rna_seq"], | |
| description="Standard bulk RNA sequencing with poly-A or ribo-depletion", | |
| typical_duration_days=3.0, | |
| typical_cost_usd=500.0, | |
| throughput="high", | |
| outputs=["fastq", "count_matrix"], | |
| ), | |
| "atac-seq": AssaySpec( | |
| name="atac-seq", | |
| category=AssayCategory.EPIGENOMICS, | |
| modalities=["scATAC-seq"], | |
| description="Assay for Transposase-Accessible Chromatin using sequencing", | |
| typical_duration_days=2.0, | |
| typical_cost_usd=6000.0, | |
| requires_live_cells=True, | |
| outputs=["fastq", "fragments", "peak_matrix"], | |
| ), | |
| "cite-seq": AssaySpec( | |
| name="cite-seq", | |
| category=AssayCategory.PROTEOMICS, | |
| modalities=["CITE-seq"], | |
| description="Simultaneous RNA + surface protein via DNA-barcoded antibodies", | |
| typical_duration_days=2.0, | |
| typical_cost_usd=8000.0, | |
| requires_live_cells=True, | |
| throughput="high (5k-20k cells)", | |
| outputs=["fastq", "count_matrix", "adt_matrix"], | |
| ), | |
| "10x_multiome": AssaySpec( | |
| name="10x_multiome", | |
| category=AssayCategory.SEQUENCING, | |
| modalities=["scMultiome"], | |
| description="Joint scRNA-seq + scATAC-seq from the same cell", | |
| typical_duration_days=2.0, | |
| typical_cost_usd=10000.0, | |
| requires_live_cells=True, | |
| throughput="high (5k-15k cells)", | |
| outputs=["fastq", "count_matrix", "fragments"], | |
| ), | |
| "visium": AssaySpec( | |
| name="visium", | |
| category=AssayCategory.SEQUENCING, | |
| modalities=["spatial_transcriptomics"], | |
| description="10x Visium spatially barcoded capture on tissue sections", | |
| typical_duration_days=3.0, | |
| typical_cost_usd=7000.0, | |
| requires_fresh_tissue=True, | |
| throughput="medium (1k-5k spots)", | |
| outputs=["fastq", "count_matrix", "spatial_coords", "image"], | |
| ), | |
| "visium_hd": AssaySpec( | |
| name="visium_hd", | |
| category=AssayCategory.SEQUENCING, | |
| modalities=["spatial_transcriptomics"], | |
| description="High-definition Visium with 2Β΅m bin resolution", | |
| typical_duration_days=3.0, | |
| typical_cost_usd=10000.0, | |
| requires_fresh_tissue=True, | |
| throughput="high", | |
| outputs=["fastq", "count_matrix", "spatial_coords", "image"], | |
| ), | |
| "merfish": AssaySpec( | |
| name="merfish", | |
| category=AssayCategory.IMAGING, | |
| modalities=["MERFISH"], | |
| description="Multiplexed Error-Robust FISH imaging-based spatial", | |
| typical_duration_days=5.0, | |
| typical_cost_usd=20000.0, | |
| requires_fresh_tissue=True, | |
| throughput="high (100-1000 genes, millions of transcripts)", | |
| outputs=["transcript_coords", "cell_segmentation"], | |
| ), | |
| "seqfish_plus": AssaySpec( | |
| name="seqfish_plus", | |
| category=AssayCategory.IMAGING, | |
| modalities=["seqFISH"], | |
| description="Sequential FISH for imaging-based spatial transcriptomics", | |
| typical_duration_days=5.0, | |
| typical_cost_usd=15000.0, | |
| requires_fresh_tissue=True, | |
| outputs=["transcript_coords"], | |
| ), | |
| "slide-seq": AssaySpec( | |
| name="slide-seq", | |
| category=AssayCategory.SEQUENCING, | |
| modalities=["Slide-seq"], | |
| description="Near-cellular spatial transcriptomics on bead arrays", | |
| typical_duration_days=3.0, | |
| typical_cost_usd=8000.0, | |
| requires_fresh_tissue=True, | |
| outputs=["count_matrix", "spatial_coords"], | |
| ), | |
| "perturb-seq": AssaySpec( | |
| name="perturb-seq", | |
| category=AssayCategory.PERTURBATION, | |
| modalities=["Perturb-seq"], | |
| description="Pooled CRISPR screen + scRNA-seq readout", | |
| typical_duration_days=14.0, | |
| typical_cost_usd=15000.0, | |
| requires_live_cells=True, | |
| throughput="high (10k-100k cells)", | |
| outputs=["fastq", "count_matrix", "guide_assignments"], | |
| ), | |
| "crop-seq": AssaySpec( | |
| name="crop-seq", | |
| category=AssayCategory.PERTURBATION, | |
| modalities=["CROP-seq"], | |
| description="CRISPR dropout screening with scRNA-seq readout", | |
| typical_duration_days=14.0, | |
| typical_cost_usd=12000.0, | |
| requires_live_cells=True, | |
| throughput="high (10k-50k cells)", | |
| outputs=["fastq", "count_matrix", "guide_assignments"], | |
| ), | |
| "patch-seq": AssaySpec( | |
| name="patch-seq", | |
| category=AssayCategory.FUNCTIONAL, | |
| modalities=["Patch-seq"], | |
| description="Patch-clamp electrophysiology + scRNA-seq on same neuron", | |
| typical_duration_days=7.0, | |
| typical_cost_usd=50000.0, | |
| requires_live_cells=True, | |
| throughput="very low (10-100 cells)", | |
| outputs=["fastq", "count_matrix", "ephys_trace", "morphology"], | |
| ), | |
| "sc_hi_c": AssaySpec( | |
| name="sc_hi_c", | |
| category=AssayCategory.EPIGENOMICS, | |
| modalities=["scHi-C"], | |
| description="Single-cell chromosome conformation capture", | |
| typical_duration_days=5.0, | |
| typical_cost_usd=15000.0, | |
| outputs=["contact_matrix"], | |
| ), | |
| "sc_bisulfite": AssaySpec( | |
| name="sc_bisulfite", | |
| category=AssayCategory.EPIGENOMICS, | |
| modalities=["scBS-seq"], | |
| description="Single-cell bisulfite sequencing for DNA methylation profiling", | |
| typical_duration_days=5.0, | |
| typical_cost_usd=12000.0, | |
| outputs=["methylation_matrix"], | |
| ), | |
| "sc_nmt_seq": AssaySpec( | |
| name="sc_nmt_seq", | |
| category=AssayCategory.EPIGENOMICS, | |
| modalities=["scNMT-seq"], | |
| description="Joint nucleosome occupancy, methylation, and transcription", | |
| typical_duration_days=7.0, | |
| typical_cost_usd=25000.0, | |
| requires_live_cells=True, | |
| throughput="low (100-1k cells)", | |
| outputs=["count_matrix", "methylation_matrix", "accessibility_matrix"], | |
| ), | |
| "flow_cytometry": AssaySpec( | |
| name="flow_cytometry", | |
| category=AssayCategory.FUNCTIONAL, | |
| modalities=[], | |
| description="Fluorescence-based cell sorting and phenotyping", | |
| typical_duration_days=1.0, | |
| typical_cost_usd=500.0, | |
| requires_live_cells=True, | |
| throughput="very high (millions of cells)", | |
| outputs=["cell_counts", "sorted_cells"], | |
| ), | |
| "mass_cytometry_CyTOF": AssaySpec( | |
| name="mass_cytometry_CyTOF", | |
| category=AssayCategory.PROTEOMICS, | |
| modalities=[], | |
| description="Mass-tag cytometry for 40+ protein markers per cell", | |
| typical_duration_days=2.0, | |
| typical_cost_usd=3000.0, | |
| requires_live_cells=True, | |
| throughput="high (100k-1M cells)", | |
| outputs=["protein_expression_matrix"], | |
| ), | |
| "western_blot": AssaySpec( | |
| name="western_blot", | |
| category=AssayCategory.PROTEOMICS, | |
| modalities=[], | |
| description="Protein detection and semi-quantification by size separation", | |
| typical_duration_days=2.0, | |
| typical_cost_usd=200.0, | |
| outputs=["band_image", "relative_quantification"], | |
| ), | |
| "qPCR": AssaySpec( | |
| name="qPCR", | |
| category=AssayCategory.FUNCTIONAL, | |
| modalities=[], | |
| description="Quantitative PCR for targeted gene expression validation", | |
| typical_duration_days=1.0, | |
| typical_cost_usd=100.0, | |
| throughput="low (target genes)", | |
| outputs=["ct_values", "fold_change"], | |
| ), | |
| "immunofluorescence": AssaySpec( | |
| name="immunofluorescence", | |
| category=AssayCategory.IMAGING, | |
| modalities=[], | |
| description="Antibody-based fluorescence imaging of proteins in situ", | |
| typical_duration_days=2.0, | |
| typical_cost_usd=500.0, | |
| outputs=["fluorescence_image"], | |
| ), | |
| "elisa": AssaySpec( | |
| name="elisa", | |
| category=AssayCategory.PROTEOMICS, | |
| modalities=[], | |
| description="Enzyme-linked immunosorbent assay for secreted protein quantification", | |
| typical_duration_days=1.0, | |
| typical_cost_usd=300.0, | |
| throughput="medium (96-384 well)", | |
| outputs=["protein_concentration"], | |
| ), | |
| "cell_viability_assay": AssaySpec( | |
| name="cell_viability_assay", | |
| category=AssayCategory.FUNCTIONAL, | |
| modalities=[], | |
| description="MTT/CellTiter-Glo viability and proliferation measurement", | |
| typical_duration_days=1.0, | |
| typical_cost_usd=200.0, | |
| requires_live_cells=True, | |
| throughput="high (96-384 well)", | |
| outputs=["viability_scores"], | |
| ), | |
| } | |
# ── Registry helper functions ───────────────────────────────────────────────
def tools_for_modality(modality: str) -> List[ToolSpec]:
    """Collect every tool in ``TOOL_REGISTRY`` that lists *modality*.

    Args:
        modality: Modality name tested with ``in`` against each tool's
            ``modalities`` collection.

    Returns:
        The matching tool specs in registry iteration order; an empty list
        when no tool lists the modality.
    """
    compatible = []
    for spec in TOOL_REGISTRY.values():
        if modality in spec.modalities:
            compatible.append(spec)
    return compatible
def assays_for_modality(modality: str) -> List[AssaySpec]:
    """Collect every assay in ``ASSAY_REGISTRY`` that lists *modality*.

    Args:
        modality: Modality name tested with ``in`` against each assay's
            ``modalities`` collection.

    Returns:
        The matching assay specs in registry iteration order; an empty list
        when no assay lists the modality.
    """
    return list(
        filter(
            lambda spec: modality in spec.modalities,
            ASSAY_REGISTRY.values(),
        )
    )
def tools_by_category(category: ToolCategory) -> List[ToolSpec]:
    """Collect every tool in ``TOOL_REGISTRY`` whose category equals *category*.

    Args:
        category: Category compared with ``==`` against each tool's
            ``category`` attribute.

    Returns:
        The matching tool specs in registry iteration order; an empty list
        when the category has no registered tools.
    """
    selected = []
    for spec in TOOL_REGISTRY.values():
        if spec.category == category:
            selected.append(spec)
    return selected
# ── Sub-agents ──────────────────────────────────────────────────────────────
class SubagentType(str, Enum):
    """Identifiers of the sub-agents an action can delegate to.

    Members are ``str``-valued, so each one compares equal to its plain
    string value. ``ExperimentAction.invoked_subagent`` is typed with this
    enum.
    """

    WET_LAB_PLANNER = "wet_lab_planner"
    COMPUTATIONAL_ANALYST = "computational_analyst"
    OMICS_QC_AGENT = "omics_qc_agent"
    CAUSAL_REASONING_AGENT = "causal_reasoning_agent"
    BUDGET_SCHEDULER = "budget_scheduler"
    BIOLOGICAL_RULE_CHECKER = "biological_rule_checker"
    TOOL_EXECUTOR = "tool_executor"
    RETROSPECTIVE_CRITIC = "retrospective_critic"
    REPORT_SYNTHESIZER = "report_synthesizer"
# ── Action schema ───────────────────────────────────────────────────────────
class ExperimentAction(Action):
    """Structured, compositional action for one experiment / analysis step.

    Hybrid representation: discrete *action_type* plus typed arguments,
    optional sub-agent / tool invocation, and calibration fields.
    """

    # The only required field: ``...`` as the default means there is no
    # fallback value and the caller must supply an ActionType.
    action_type: ActionType = Field(
        ...,
        description=(
            "Discrete simulator step type. The environment enforces scientific "
            "prerequisites between steps, so actions should follow a valid "
            "pipeline order."
        ),
    )
    # Defaults to a fresh empty list per instance (default_factory).
    input_targets: List[str] = Field(
        default_factory=list,
        description=(
            "Optional references to prior samples, outputs, or artifacts that "
            "this step consumes."
        ),
    )
    # Free-form string; nothing in this schema restricts it to registry names.
    method: Optional[str] = Field(
        None,
        description=(
            "Optional named tool or protocol (for example 'Seurat' or "
            "'CellRanger'). Prefer methods compatible with the current "
            "modality and available tool list because tool choice can change "
            "runtime, cost, and scientific fit."
        ),
    )
    # Untyped key/value bag; defaults to a fresh empty dict per instance.
    parameters: Dict[str, Any] = Field(
        default_factory=dict,
        description=(
            "Action-specific settings such as comparison labels, perturbation "
            "targets, or analysis options. Use only parameters that materially "
            "change the scientific step."
        ),
    )
    expected_output_type: Optional[str] = Field(
        None,
        description=(
            "Optional expected artifact or summary that should result from the "
            "step, such as a count matrix, QC report, DE table, or validation "
            "result."
        ),
    )
    justification: Optional[str] = Field(
        None,
        description=(
            "Short scientific rationale explaining why this is the right next "
            "step in the current environment state."
        ),
    )
    invoked_subagent: Optional[SubagentType] = Field(
        None, description="Sub-agent to delegate to, if any"
    )
    tool_call_spec: Optional[Dict[str, Any]] = Field(
        None,
        description=(
            "Optional structured tool invocation payload when the action needs "
            "a more explicit tool execution plan."
        ),
    )
    # Constrained to the closed interval [0.0, 1.0]; defaults to 0.5.
    confidence: float = Field(
        0.5, ge=0.0, le=1.0, description="Agent confidence in this step"
    )
# ── Intermediate outputs ────────────────────────────────────────────────────
class OutputType(str, Enum):
    """Kinds of output a pipeline step can be tagged with.

    Used as the type of ``IntermediateOutput.output_type`` and
    ``PipelineStepRecord.output_type``. Members are ``str``-valued, so each
    one compares equal to its plain string value.
    """

    QC_METRICS = "qc_metrics"
    COUNT_MATRIX_SUMMARY = "count_matrix_summary"
    EMBEDDING_SUMMARY = "embedding_summary"
    CLUSTER_RESULT = "cluster_result"
    DE_RESULT = "de_result"
    PATHWAY_RESULT = "pathway_result"
    TRAJECTORY_RESULT = "trajectory_result"
    VALIDATION_RESULT = "validation_result"
    NETWORK_RESULT = "network_result"
    SAMPLE_COLLECTION_RESULT = "sample_collection_result"
    LIBRARY_PREP_RESULT = "library_prep_result"
    SEQUENCING_RESULT = "sequencing_result"
    PERTURBATION_RESULT = "perturbation_result"
    CULTURE_RESULT = "culture_result"
    COHORT_RESULT = "cohort_result"
    FOLLOWUP_DESIGN = "followup_design"
    MARKER_RESULT = "marker_result"
    FAILURE_REPORT = "failure_report"
    SUBAGENT_REPORT = "subagent_report"
    CONCLUSION = "conclusion"
class IntermediateOutput(BaseModel):
    """A single simulated output from one pipeline step."""

    # Required fields: the kind of output and the step it belongs to.
    output_type: OutputType
    step_index: int
    success: bool = True
    # Constrained to [0.0, 1.0]; defaults to 1.0.
    quality_score: float = Field(1.0, ge=0.0, le=1.0)
    summary: str = ""
    # Untyped payload; a fresh empty dict per instance.
    data: Dict[str, Any] = Field(default_factory=dict)
    # Constrained to [0.0, 1.0]; defaults to 0.0.
    uncertainty: float = Field(0.0, ge=0.0, le=1.0)
    warnings: List[str] = Field(default_factory=list)
    artifacts_available: List[str] = Field(default_factory=list)
# ── Observable state components ─────────────────────────────────────────────
class ResourceUsage(BaseModel):
    # Running resource totals; ExperimentObservation.resource_usage holds one.
    # Every field has a default, so ResourceUsage() is a valid starting state.
    budget_used: float = 0.0
    # Same default as TaskSpec.budget_limit (presumably USD -- confirm).
    budget_remaining: float = 100_000.0
    time_used_days: float = 0.0
    # Same default as TaskSpec.time_limit_days.
    time_remaining_days: float = 180.0
    samples_consumed: int = 0
    compute_hours_used: float = 0.0
class PipelineStepRecord(BaseModel):
    # One entry of ExperimentObservation.pipeline_history.
    # step_index, action_type and output_type are required; the rest default.
    step_index: int
    action_type: ActionType
    method: Optional[str] = None
    parameters: Dict[str, Any] = Field(default_factory=dict)
    output_summary: str = ""
    output_type: OutputType
    success: bool = True
    # NOTE(review): unlike IntermediateOutput.quality_score, no ge/le bounds
    # are declared here, so out-of-range values are accepted.
    quality_score: float = 1.0
    resource_cost: float = 0.0
    time_cost_days: float = 0.0
class PaperReference(BaseModel):
    """Metadata for a literature source used to ground a task."""

    # Only the title is required; every identifier below is optional.
    title: str
    citation: Optional[str] = None
    doi: Optional[str] = None
    pmid: Optional[str] = None
    url: Optional[str] = None
class ExpectedFinding(BaseModel):
    """A paper-backed result that the agent should try to recover."""

    # Required free-text statement of the finding.
    finding: str
    # Free-form label; defaults to "claim" (no closed set is enforced here).
    category: str = "claim"
    keywords: List[str] = Field(default_factory=list)
class TaskSpec(BaseModel):
    """Specification of the biological problem to solve."""

    # Every field has a default, so TaskSpec() is constructible with no
    # arguments (ExperimentObservation.task relies on this).
    problem_statement: str = "Unspecified biological problem"
    modality: str = "scRNA-seq"
    organism: str = "human"
    tissue: str = "blood"
    conditions: List[str] = Field(default_factory=list)
    # NOTE: the default is every key of ASSAY_REGISTRY; the modality filtering
    # the description mentions is not applied by this default and is
    # presumably done by whoever builds the task -- confirm against callers.
    available_assays: List[str] = Field(
        default_factory=lambda: list(ASSAY_REGISTRY.keys()),
        description=(
            "Assays that are scientifically compatible with this task's "
            "modality. These are the relevant assay choices for the episode, "
            "not an unrestricted catalog."
        ),
    )
    # NOTE: likewise defaults to every key of TOOL_REGISTRY, unfiltered.
    available_tools: List[str] = Field(
        default_factory=lambda: list(TOOL_REGISTRY.keys()),
        description=(
            "Tools filtered to those compatible with the current task "
            "modality. The agent should treat this list as the preferred tool "
            "set for the episode."
        ),
    )
    # Same defaults as ResourceUsage.budget_remaining / time_remaining_days.
    budget_limit: float = 100_000.0
    time_limit_days: float = 180.0
    prior_observations: List[str] = Field(default_factory=list)
    success_criteria: List[str] = Field(default_factory=list)
    dataset_metadata: Dict[str, Any] = Field(default_factory=dict)
    paper_references: List[PaperReference] = Field(default_factory=list)
    expected_findings: List[ExpectedFinding] = Field(default_factory=list)
class ConclusionClaim(BaseModel):
    # One structured claim; ExperimentObservation.conclusions holds a list of
    # these. The field names match the keys of the "claims" example emitted by
    # build_agent_system_prompt for synthesize_conclusion.
    claim: str = ""
    top_markers: List[str] = Field(default_factory=list)
    causal_mechanisms: List[str] = Field(default_factory=list)
    # Maps a pathway name to a float (the prompt example uses 0.8).
    predicted_pathways: Dict[str, float] = Field(default_factory=dict)
    # Integer step references -- presumably pipeline step indices; confirm.
    evidence_steps: List[int] = Field(default_factory=list)
    # Constrained to [0.0, 1.0]; defaults to 0.5.
    confidence: float = Field(0.5, ge=0.0, le=1.0)
    # Free-form string; defaults to "correlational" (the prompt example shows
    # "causal" as another value).
    claim_type: str = "correlational"
    supporting_data: Dict[str, Any] = Field(default_factory=dict)
# ── Observation schema ──────────────────────────────────────────────────────
class ExperimentObservation(Observation):
    """Full observable state returned to the agent at each timestep.

    Deliberately excludes hidden latent biological truth, hidden failure
    conditions, and ground-truth mechanisms.
    """

    # Every field has a default, so an empty observation is constructible.
    task: TaskSpec = Field(default_factory=TaskSpec)
    step_index: int = 0
    pipeline_history: List[PipelineStepRecord] = Field(default_factory=list)
    # Empty by default; build_agent_observation_context falls back to
    # task.available_assays when this list is empty.
    available_assays: List[str] = Field(
        default_factory=list,
        description=(
            "Episode-specific assay choices already filtered to the current "
            "modality and task context."
        ),
    )
    # Empty by default; build_agent_observation_context falls back to
    # task.available_tools when this list is empty.
    available_tools: List[str] = Field(
        default_factory=list,
        description=(
            "Episode-specific compatible tools. These are the methods the "
            "agent should prefer instead of inventing incompatible tools."
        ),
    )
    resource_usage: ResourceUsage = Field(
        default_factory=ResourceUsage,
        description=(
            "Running budget, time, and compute usage after previous actions."
        ),
    )
    # None until an output exists; all_outputs is the accumulated list.
    latest_output: Optional[IntermediateOutput] = None
    all_outputs: List[IntermediateOutput] = Field(default_factory=list)
    discovered_markers: List[str] = Field(default_factory=list)
    candidate_mechanisms: List[str] = Field(default_factory=list)
    uncertainty_summary: Dict[str, float] = Field(default_factory=dict)
    subagent_outputs: List[Dict[str, Any]] = Field(default_factory=list)
    conclusions: List[ConclusionClaim] = Field(default_factory=list)
    rule_violations: List[str] = Field(default_factory=list)
    step_reward_breakdown: Dict[str, float] = Field(default_factory=dict)
# Per-action guidance text rendered into the agent system prompt.
# build_agent_system_prompt indexes this mapping with every ActionType member
# (plain ``[...]`` lookup), so each member needs an entry here or prompt
# construction raises KeyError.
AGENT_ACTION_GUIDANCE: Dict[ActionType, str] = {
    ActionType.COLLECT_SAMPLE: (
        "Wet-lab entry point. One successful collection usually provides enough "
        "material to continue unless the output shows poor yield or quality."
    ),
    ActionType.SELECT_COHORT: (
        "Use when subject stratification is part of the scientific question "
        "before downstream experimental work."
    ),
    ActionType.PREPARE_LIBRARY: (
        "Requires collected samples and converts biological material into "
        "sequence-ready libraries."
    ),
    ActionType.CULTURE_CELLS: (
        "Requires collected samples and adds substantial time; use only when "
        "live-cell expansion or later perturbation is needed."
    ),
    ActionType.PERTURB_GENE: (
        "Requires samples. Use for causal tests, not as a default discovery "
        "step."
    ),
    ActionType.PERTURB_COMPOUND: (
        "Requires samples. Best for mechanistic follow-up or treatment "
        "response questions."
    ),
    ActionType.SEQUENCE_CELLS: (
        "Requires prepared libraries and produces the raw sequencing-derived "
        "artifacts used by downstream QC and analysis."
    ),
    ActionType.RUN_QC: (
        "Requires sequencing and returns summarized quality metrics such as "
        "doublets, mitochondrial fraction, and ambient RNA."
    ),
    ActionType.FILTER_DATA: (
        "Requires QC and removes poor-quality cells, changing downstream cell "
        "counts and data retention."
    ),
    ActionType.NORMALIZE_DATA: (
        "Requires filtered data and unlocks clustering, differential "
        "expression, trajectory, and network analyses."
    ),
    ActionType.INTEGRATE_BATCHES: (
        "Requires normalized data. Use when batch effects are likely to "
        "confound interpretation; it is not always necessary."
    ),
    ActionType.CLUSTER_CELLS: (
        "Requires normalized data and identifies cell populations or states "
        "for downstream interpretation."
    ),
    ActionType.DIFFERENTIAL_EXPRESSION: (
        "Requires normalized data and is the main route to candidate genes "
        "for pathway analysis and marker selection."
    ),
    ActionType.TRAJECTORY_ANALYSIS: (
        "Requires normalized data and is most useful when lineage progression "
        "or pseudotime is central to the task."
    ),
    ActionType.PATHWAY_ENRICHMENT: (
        "Requires differential expression. Results are less reliable without a "
        "strong DE gene list."
    ),
    ActionType.REGULATORY_NETWORK_INFERENCE: (
        "Requires normalized data and is most helpful once cell states or "
        "trajectories are already characterized."
    ),
    ActionType.MARKER_SELECTION: (
        "Requires differential expression and turns candidate genes into a "
        "short list for validation."
    ),
    ActionType.VALIDATE_MARKER: (
        "Requires discovered markers and is an expensive wet-lab confirmation "
        "step that should follow strong computational evidence."
    ),
    ActionType.DESIGN_FOLLOWUP: (
        "Use to propose targeted next experiments once remaining uncertainty "
        "is clear."
    ),
    ActionType.REQUEST_SUBAGENT_REVIEW: (
        "Use for critique or planning support, not as a substitute for "
        "missing experimental evidence."
    ),
    ActionType.SYNTHESIZE_CONCLUSION: (
        "Use once the evidence is sufficient. Do not spend budget on redundant "
        "steps just because more actions are possible."
    ),
}
# Environment-wide reasoning rules. build_agent_system_prompt emits one
# bullet per entry, in list order, under the heading
# "Environment-specific reasoning rules:".
AGENT_ENVIRONMENT_RULES: List[str] = [
    (
        "Each successful action already returns summarized scientific evidence, "
        "so repeated sampling or repeated analysis is not the default."
    ),
    (
        "Repeat a step only when the task demands it or when prior outputs show "
        "poor quality, insufficient yield, unresolved batch effects, or another "
        "clear failure mode."
    ),
    (
        "The available tool and assay lists are already filtered to the current "
        "task modality, so prefer them over inventing incompatible methods."
    ),
    (
        "Hard scientific prerequisites are enforced by the environment, so "
        "invalid pipeline orderings will be blocked."
    ),
]
# One usage hint per tool category, appended to a tool's description by
# describe_tool_for_agent. That function looks entries up with ``.get``, so a
# category missing from this mapping simply contributes no hint.
_TOOL_CATEGORY_AGENT_NOTES: Dict[ToolCategory, str] = {
    ToolCategory.ALIGNMENT: (
        "Best immediately after sequencing to turn FASTQ-like inputs into "
        "count-style matrices for downstream analysis."
    ),
    ToolCategory.PREPROCESSING: (
        "Useful for general single-cell data handling before specialized "
        "downstream analyses."
    ),
    ToolCategory.NORMALIZATION: (
        "Applies after filtering to produce normalized matrices for downstream "
        "modeling."
    ),
    ToolCategory.DIMENSIONALITY_REDUCTION: (
        "Builds latent embeddings that support clustering or trajectory work."
    ),
    ToolCategory.CLUSTERING: (
        "Best once data are normalized and the goal is to resolve cell states "
        "or populations."
    ),
    ToolCategory.DIFFERENTIAL_EXPRESSION: (
        "Tests contrasts and produces ranked genes for biological "
        "interpretation."
    ),
    ToolCategory.TRAJECTORY: (
        "Useful when the task asks about developmental progression, state "
        "transitions, or pseudotime."
    ),
    ToolCategory.GENE_REGULATORY_NETWORK: (
        "Most useful after normalized data and some cell-state structure are "
        "already established."
    ),
    ToolCategory.GENE_SET_ANALYSIS: (
        "Best after differential expression to interpret gene lists at the "
        "pathway level."
    ),
    ToolCategory.BATCH_CORRECTION: (
        "Use when batch effects would confound interpretation; unnecessary use "
        "adds extra steps."
    ),
    ToolCategory.MULTIMODAL_INTEGRATION: (
        "Useful only when combining modalities or batches is part of the "
        "scientific question."
    ),
    ToolCategory.QUALITY_CONTROL: (
        "Helps identify low-quality cells or technical artifacts before "
        "filtering."
    ),
    ToolCategory.CELL_TYPE_ANNOTATION: (
        "Best after clustering when assigning biological identities to groups."
    ),
    ToolCategory.PERTURBATION_ANALYSIS: (
        "Use when perturbations were actually applied and the goal is to model "
        "their transcriptional effects."
    ),
    ToolCategory.SPATIAL: (
        "Only useful when the modality includes spatial coordinates or tissue "
        "context."
    ),
}
| def _format_currency(value: float) -> str: | |
| return f"${value:,.0f}" | |
| def _format_runtime_hours(hours: float) -> str: | |
| if hours < 1.0: | |
| return f"{int(round(hours * 60))}m" | |
| if float(hours).is_integer(): | |
| return f"{int(hours)}h" | |
| return f"{hours:.1f}h" | |
def describe_tool_for_agent(tool_name: str) -> str:
    """Build a one-line, prompt-ready description of a registered tool.

    Args:
        tool_name: Key to look up in ``TOOL_REGISTRY``.

    Returns:
        *tool_name* unchanged when it is not registered. Otherwise a string
        of space-joined sentences: the tool name and description, its inputs
        and outputs (when either is declared), the category hint from
        ``_TOOL_CATEGORY_AGENT_NOTES`` (when one exists), and its typical
        resources (cost, runtime, GPU) when any of them applies.
    """
    spec = TOOL_REGISTRY.get(tool_name)
    if spec is None:
        return tool_name

    sentences = [f"{spec.name}: {spec.description}."]

    # I/O sentence: a missing side falls back to a generic placeholder.
    if spec.input_types or spec.output_types:
        consumed = ", ".join(spec.input_types) or "upstream artifacts"
        produced = ", ".join(spec.output_types) or "analysis artifacts"
        sentences.append(f"Consumes {consumed}; yields {produced}.")

    hint = _TOOL_CATEGORY_AGENT_NOTES.get(spec.category)
    if hint:
        sentences.append(hint)

    # Each resource is rendered lazily, and only when its condition holds.
    resource_checks = (
        (
            spec.typical_cost_usd > 0,
            lambda: _format_currency(spec.typical_cost_usd),
        ),
        (
            spec.typical_runtime_hours > 0,
            lambda: _format_runtime_hours(spec.typical_runtime_hours),
        ),
        (spec.requires_gpu, lambda: "GPU"),
    )
    resources = [render() for applies, render in resource_checks if applies]
    if resources:
        sentences.append(f"Typical resources: {', '.join(resources)}.")

    return " ".join(sentences)
def describe_assay_for_agent(assay_name: str) -> str:
    """Build a one-line, prompt-ready description of a registered assay.

    Args:
        assay_name: Key to look up in ``ASSAY_REGISTRY``.

    Returns:
        *assay_name* unchanged when it is not registered. Otherwise a string
        of space-joined sentences: the assay name and description, its
        outputs (when any are declared), its live-cell / fresh-tissue
        requirements (when any are set), and its typical cost and duration.
    """
    spec = ASSAY_REGISTRY.get(assay_name)
    if spec is None:
        return assay_name

    sentences = [f"{spec.name}: {spec.description}."]

    if spec.outputs:
        sentences.append("Produces " + ", ".join(spec.outputs) + ".")

    # Keep only the sample requirements that are actually switched on.
    requirement_flags = (
        (spec.requires_live_cells, "live cells"),
        (spec.requires_fresh_tissue, "fresh tissue"),
    )
    needed = [label for enabled, label in requirement_flags if enabled]
    if needed:
        sentences.append("Requires " + " and ".join(needed) + ".")

    cost = _format_currency(spec.typical_cost_usd)
    duration = f"{spec.typical_duration_days:.1f}d"
    sentences.append(f"Typical resources: {cost}, {duration}.")

    return " ".join(sentences)
def build_agent_system_prompt() -> str:
    """Assemble the system prompt shared by training and inference.

    The prompt consists of a fixed introduction, one bullet per entry of
    ``AGENT_ENVIRONMENT_RULES``, one bullet per ``ActionType`` member with its
    text from ``AGENT_ACTION_GUIDANCE``, and the JSON response-format
    examples.

    Returns:
        The prompt as a single newline-joined string.

    Raises:
        KeyError: If an ``ActionType`` member has no guidance entry.
    """
    preamble = [
        "You are an expert biologist planning a single-cell experiment pipeline.",
        "",
        "At each turn you see the experiment state and must pick the next scientifically justified step.",
        "",
        "Environment-specific reasoning rules:",
    ]
    rule_bullets = [f" - {rule}" for rule in AGENT_ENVIRONMENT_RULES]
    guidance_bullets = [
        f" - {kind.value}: {AGENT_ACTION_GUIDANCE[kind]}"
        for kind in ActionType
    ]
    response_format = [
        "",
        "Respond with ONLY valid JSON, nothing else:",
        '{"action_type": "...", "method": null, "parameters": {}, "justification": "...", "confidence": 0.8}',
        "",
        "For synthesize_conclusion, use structured claims:",
        '{"action_type": "synthesize_conclusion", "parameters": {"claims": [{"top_markers": ["GENE1", "GENE2"], "causal_mechanisms": ["mechanism description"], "predicted_pathways": {"pathway_name": 0.8}, "confidence": 0.8, "claim_type": "causal", "claim": "optional free text"}]}, "justification": "...", "confidence": 0.8}',
    ]
    return "\n".join(
        [
            *preamble,
            *rule_bullets,
            "",
            "Action guidance:",
            *guidance_bullets,
            *response_format,
        ]
    )
def build_agent_observation_context(
    obs: ExperimentObservation,
    *,
    max_tools: int = 6,
    max_assays: int = 3,
) -> str:
    """Render the modality, tool and assay context for one observation.

    Args:
        obs: Observation to summarize. Its own ``available_tools`` /
            ``available_assays`` are used when non-empty; otherwise the
            lists on ``obs.task`` are used.
        max_tools: Number of tools described in full; up to six further
            tool names are listed by name only.
        max_assays: Number of assays described in full; up to four further
            assay names are listed by name only.

    Returns:
        A newline-joined block: one modality line, then the tool section and
        the assay section (each omitted when its list is empty).
    """

    def _catalog_section(names, heading, describe, full_count, extra_count,
                         overflow_prefix):
        # De-duplicate while keeping first-seen order.
        unique_names = list(dict.fromkeys(names))
        if not unique_names:
            return []
        block = [heading]
        block.extend(
            f" - {describe(name)}" for name in unique_names[:full_count]
        )
        if len(unique_names) > full_count:
            # Only a bounded window of the overflow is listed by name.
            overflow = unique_names[full_count:full_count + extra_count]
            block.append(overflow_prefix + ", ".join(overflow))
        return block

    modality_name = obs.task.modality
    modality_spec = MODALITY_REGISTRY.get(modality_name)
    if modality_spec is None:
        context_lines = [f"Modality context: {modality_name}."]
    else:
        context_lines = [
            "Modality context: "
            f"{modality_spec.name} measures {modality_spec.measurement} at "
            f"{modality_spec.resolution} resolution; typical scale "
            f"{modality_spec.typical_cells}."
        ]

    context_lines += _catalog_section(
        obs.available_tools or obs.task.available_tools,
        "Available tools (already filtered to this modality):",
        describe_tool_for_agent,
        max_tools,
        6,
        " - Additional compatible tools not shown in full: ",
    )
    context_lines += _catalog_section(
        obs.available_assays or obs.task.available_assays,
        "Available assays:",
        describe_assay_for_agent,
        max_assays,
        4,
        " - Additional compatible assays not shown in full: ",
    )
    return "\n".join(context_lines)