"""
Data models for the Bio-Experiment Planning RL Environment.

Defines the POMDP action and observation contracts for a scientific agent
that constructs biological experiment pipelines step-by-step.
"""
|
| |
|
| | from __future__ import annotations
|
| |
|
| | from enum import Enum
|
| | from typing import Any, Dict, List, Optional
|
| |
|
| | from pydantic import BaseModel, Field
|
| |
|
| | from openenv.core.env_server.types import Action, Observation
|
| |
|
| |
|
| |
|
| |
|
| |
|
class ActionType(str, Enum):
    """Closed set of step kinds an agent can choose when building a pipeline.

    Members subclass ``str``, so each one compares equal to its wire value
    (for example ``ActionType.RUN_QC == "run_qc"``) and can be looked up from
    that value with ``ActionType("run_qc")``.  Declaration order is preserved
    when iterating the enum.
    """

    # Sample handling, perturbation and sequencing steps.
    COLLECT_SAMPLE = "collect_sample"
    SELECT_COHORT = "select_cohort"
    PREPARE_LIBRARY = "prepare_library"
    CULTURE_CELLS = "culture_cells"
    PERTURB_GENE = "perturb_gene"
    PERTURB_COMPOUND = "perturb_compound"
    SEQUENCE_CELLS = "sequence_cells"

    # Data processing and analysis steps.
    RUN_QC = "run_qc"
    FILTER_DATA = "filter_data"
    NORMALIZE_DATA = "normalize_data"
    INTEGRATE_BATCHES = "integrate_batches"
    CLUSTER_CELLS = "cluster_cells"
    DIFFERENTIAL_EXPRESSION = "differential_expression"
    TRAJECTORY_ANALYSIS = "trajectory_analysis"
    PATHWAY_ENRICHMENT = "pathway_enrichment"
    REGULATORY_NETWORK_INFERENCE = "regulatory_network_inference"
    MARKER_SELECTION = "marker_selection"

    # Validation, planning and reporting steps.
    VALIDATE_MARKER = "validate_marker"
    # NOTE: the value is longer than the member name ("..._experiment").
    DESIGN_FOLLOWUP = "design_followup_experiment"
    REQUEST_SUBAGENT_REVIEW = "request_subagent_review"
    SYNTHESIZE_CONCLUSION = "synthesize_conclusion"
|
| |
|
| |
|
# Action types grouped as wet-lab steps.  Stored as a frozenset so the
# grouping is immutable and membership tests (`action in WET_LAB_ACTIONS`)
# are O(1).
WET_LAB_ACTIONS = frozenset({
    ActionType.COLLECT_SAMPLE,
    ActionType.SELECT_COHORT,
    ActionType.PREPARE_LIBRARY,
    ActionType.CULTURE_CELLS,
    ActionType.PERTURB_GENE,
    ActionType.PERTURB_COMPOUND,
    ActionType.SEQUENCE_CELLS,
    ActionType.VALIDATE_MARKER,
})
|
| |
|
# Action types grouped as computational (data-analysis) steps.  Immutable
# frozenset for cheap membership tests.
COMPUTATIONAL_ACTIONS = frozenset({
    ActionType.RUN_QC,
    ActionType.FILTER_DATA,
    ActionType.NORMALIZE_DATA,
    ActionType.INTEGRATE_BATCHES,
    ActionType.CLUSTER_CELLS,
    ActionType.DIFFERENTIAL_EXPRESSION,
    ActionType.TRAJECTORY_ANALYSIS,
    ActionType.PATHWAY_ENRICHMENT,
    ActionType.REGULATORY_NETWORK_INFERENCE,
    ActionType.MARKER_SELECTION,
})
|
| |
|
# Action types grouped as meta steps (follow-up design, review requests,
# conclusion synthesis).  Immutable frozenset for cheap membership tests.
META_ACTIONS = frozenset({
    ActionType.DESIGN_FOLLOWUP,
    ActionType.REQUEST_SUBAGENT_REVIEW,
    ActionType.SYNTHESIZE_CONCLUSION,
})
|
| |
|
| |
|
| |
|
| |
|
| |
|
class ToolCategory(str, Enum):
    """Functional category assigned to a registered bioinformatics tool.

    Members subclass ``str``, so each compares equal to its string value and
    can be constructed from it (``ToolCategory("clustering")``).
    """

    ALIGNMENT = "alignment"
    PREPROCESSING = "preprocessing"
    NORMALIZATION = "normalization"
    DIMENSIONALITY_REDUCTION = "dimensionality_reduction"
    CLUSTERING = "clustering"
    DIFFERENTIAL_EXPRESSION = "differential_expression"
    TRAJECTORY = "trajectory"
    GENE_REGULATORY_NETWORK = "gene_regulatory_network"
    CELL_COMMUNICATION = "cell_communication"
    SPATIAL = "spatial"
    MULTIMODAL_INTEGRATION = "multimodal_integration"
    GENE_SET_ANALYSIS = "gene_set_analysis"
    VARIANT_CALLING = "variant_calling"
    PEAK_CALLING = "peak_calling"
    IMPUTATION = "imputation"
    BATCH_CORRECTION = "batch_correction"
    CELL_TYPE_ANNOTATION = "cell_type_annotation"
    SIMULATION = "simulation"
    VISUALIZATION = "visualization"
    QUALITY_CONTROL = "quality_control"
    PERTURBATION_ANALYSIS = "perturbation_analysis"
|
| |
|
| |
|
class ToolSpec(BaseModel):
    """Registry entry describing a bioinformatics tool.

    Only ``name`` and ``category`` are required; every other field has a
    default.  List fields use ``default_factory`` so each instance gets its
    own list.
    """

    name: str  # tool identifier
    category: ToolCategory
    modalities: List[str] = Field(default_factory=list)  # modality names, e.g. "scRNA-seq"
    description: str = ""
    input_types: List[str] = Field(default_factory=list)  # artifact labels consumed, e.g. "fastq"
    output_types: List[str] = Field(default_factory=list)  # artifact labels produced, e.g. "count_matrix"
    typical_runtime_hours: float = 0.1
    typical_cost_usd: float = 0.0
    requires_gpu: bool = False
    open_source: bool = True
|
| |
|
| |
|
# Registry of known bioinformatics tools.
#
# Each key is taken from the spec's own ``name`` so the lookup key and the
# stored name cannot drift apart.  Insertion order follows the tuple below;
# the section comments are reading aids only.
TOOL_REGISTRY: Dict[str, ToolSpec] = {
    spec.name: spec
    for spec in (
        # --- Alignment / quantification ---
        ToolSpec(
            name="CellRanger",
            category=ToolCategory.ALIGNMENT,
            modalities=["scRNA-seq", "scATAC-seq", "CITE-seq", "scMultiome"],
            description="10x Genomics pipeline for alignment, barcode demux, and counting",
            input_types=["fastq"],
            output_types=["count_matrix", "bam"],
            typical_runtime_hours=4.0,
            open_source=False,
        ),
        ToolSpec(
            name="STARsolo",
            category=ToolCategory.ALIGNMENT,
            modalities=["scRNA-seq", "scATAC-seq"],
            description="Drop-seq / 10x-compatible aligner built into STAR",
            input_types=["fastq"],
            output_types=["count_matrix", "bam"],
            typical_runtime_hours=3.0,
        ),
        ToolSpec(
            name="kallisto_bustools",
            category=ToolCategory.ALIGNMENT,
            modalities=["scRNA-seq"],
            description="Pseudoalignment-based lightweight quantification",
            input_types=["fastq"],
            output_types=["count_matrix"],
            typical_runtime_hours=1.0,
        ),
        ToolSpec(
            name="Salmon_alevin",
            category=ToolCategory.ALIGNMENT,
            modalities=["scRNA-seq"],
            description="Quasi-mapping quantification for single-cell RNA-seq",
            input_types=["fastq"],
            output_types=["count_matrix"],
            typical_runtime_hours=1.5,
        ),
        ToolSpec(
            name="spaceranger",
            category=ToolCategory.ALIGNMENT,
            modalities=["spatial_transcriptomics"],
            description="10x Visium spatial alignment and quantification",
            input_types=["fastq", "image"],
            output_types=["count_matrix", "spatial_coords"],
            typical_runtime_hours=3.0,
            open_source=False,
        ),
        # --- Preprocessing frameworks ---
        ToolSpec(
            name="Scanpy",
            category=ToolCategory.PREPROCESSING,
            modalities=["scRNA-seq", "scATAC-seq", "spatial_transcriptomics"],
            description="Python single-cell analysis framework",
            input_types=["count_matrix", "h5ad"],
            output_types=["h5ad", "embedding", "cluster_result"],
            typical_runtime_hours=0.5,
        ),
        ToolSpec(
            name="Seurat",
            category=ToolCategory.PREPROCESSING,
            modalities=["scRNA-seq", "CITE-seq", "spatial_transcriptomics", "scATAC-seq"],
            description="R single-cell analysis toolkit with multimodal support",
            input_types=["count_matrix", "h5seurat"],
            output_types=["h5seurat", "embedding", "cluster_result"],
            typical_runtime_hours=0.5,
        ),
        ToolSpec(
            name="Bioconductor_SingleCellExperiment",
            category=ToolCategory.PREPROCESSING,
            modalities=["scRNA-seq"],
            description="R/Bioconductor framework for single-cell experiments",
            input_types=["count_matrix"],
            output_types=["sce_object"],
            typical_runtime_hours=0.3,
        ),
        # --- Normalization ---
        ToolSpec(
            name="scran",
            category=ToolCategory.NORMALIZATION,
            modalities=["scRNA-seq"],
            description="Pool-based size-factor normalization",
            input_types=["count_matrix"],
            output_types=["normalized_matrix"],
        ),
        ToolSpec(
            name="sctransform",
            category=ToolCategory.NORMALIZATION,
            modalities=["scRNA-seq"],
            description="Variance-stabilizing transformation via regularized NB regression",
            input_types=["count_matrix"],
            output_types=["normalized_matrix"],
        ),
        # --- Dimensionality reduction ---
        ToolSpec(
            name="scVI",
            category=ToolCategory.DIMENSIONALITY_REDUCTION,
            modalities=["scRNA-seq", "CITE-seq", "scATAC-seq"],
            description="Deep generative model for scRNA-seq (variational inference)",
            input_types=["count_matrix"],
            output_types=["latent_embedding"],
            requires_gpu=True,
        ),
        ToolSpec(
            name="UMAP",
            category=ToolCategory.DIMENSIONALITY_REDUCTION,
            modalities=["scRNA-seq", "scATAC-seq", "CITE-seq", "spatial_transcriptomics"],
            description="Uniform manifold approximation for 2D/3D visualization",
            input_types=["pca_embedding", "latent_embedding"],
            output_types=["2d_embedding"],
        ),
        # --- Clustering ---
        ToolSpec(
            name="Leiden",
            category=ToolCategory.CLUSTERING,
            modalities=["scRNA-seq", "scATAC-seq", "CITE-seq"],
            description="Community detection via the Leiden algorithm",
            input_types=["knn_graph"],
            output_types=["cluster_result"],
        ),
        ToolSpec(
            name="Louvain",
            category=ToolCategory.CLUSTERING,
            modalities=["scRNA-seq", "scATAC-seq"],
            description="Community detection via Louvain modularity optimization",
            input_types=["knn_graph"],
            output_types=["cluster_result"],
        ),
        # --- Differential expression ---
        ToolSpec(
            name="DESeq2",
            category=ToolCategory.DIFFERENTIAL_EXPRESSION,
            modalities=["bulk_rna_seq", "scRNA-seq"],
            description="Negative binomial GLM-based differential expression",
            input_types=["count_matrix"],
            output_types=["de_result"],
        ),
        ToolSpec(
            name="MAST",
            category=ToolCategory.DIFFERENTIAL_EXPRESSION,
            modalities=["scRNA-seq"],
            description="Two-part hurdle model for scRNA-seq DE testing",
            input_types=["count_matrix"],
            output_types=["de_result"],
        ),
        ToolSpec(
            name="edgeR",
            category=ToolCategory.DIFFERENTIAL_EXPRESSION,
            modalities=["bulk_rna_seq", "scRNA-seq"],
            description="Empirical Bayes quasi-likelihood DE testing",
            input_types=["count_matrix"],
            output_types=["de_result"],
        ),
        ToolSpec(
            name="Wilcoxon",
            category=ToolCategory.DIFFERENTIAL_EXPRESSION,
            modalities=["scRNA-seq"],
            description="Rank-sum test for marker gene detection",
            input_types=["count_matrix"],
            output_types=["de_result"],
        ),
        # --- Trajectory inference ---
        ToolSpec(
            name="Monocle3",
            category=ToolCategory.TRAJECTORY,
            modalities=["scRNA-seq"],
            description="Reversed graph embedding for pseudotime trajectories",
            input_types=["count_matrix", "embedding"],
            output_types=["trajectory_result", "pseudotime"],
        ),
        ToolSpec(
            name="scVelo",
            category=ToolCategory.TRAJECTORY,
            modalities=["scRNA-seq"],
            description="RNA velocity estimation via spliced/unspliced dynamics",
            input_types=["count_matrix"],
            output_types=["velocity_result"],
        ),
        ToolSpec(
            name="CellRank",
            category=ToolCategory.TRAJECTORY,
            modalities=["scRNA-seq"],
            description="Fate probability estimation combining velocity and transcriptomics",
            input_types=["velocity_result", "count_matrix"],
            output_types=["fate_probabilities"],
        ),
        ToolSpec(
            name="Slingshot",
            category=ToolCategory.TRAJECTORY,
            modalities=["scRNA-seq"],
            description="Minimum spanning tree-based trajectory inference",
            input_types=["embedding", "cluster_result"],
            output_types=["trajectory_result", "pseudotime"],
        ),
        ToolSpec(
            name="PAGA",
            category=ToolCategory.TRAJECTORY,
            modalities=["scRNA-seq"],
            description="Partition-based graph abstraction for topology estimation",
            input_types=["knn_graph", "cluster_result"],
            output_types=["trajectory_result"],
        ),
        # --- Gene regulatory networks ---
        ToolSpec(
            name="SCENIC",
            category=ToolCategory.GENE_REGULATORY_NETWORK,
            modalities=["scRNA-seq"],
            description="Single-cell regulatory network inference and clustering",
            input_types=["count_matrix"],
            output_types=["regulon_result", "network_result"],
            typical_runtime_hours=6.0,
        ),
        ToolSpec(
            name="CellOracle",
            category=ToolCategory.GENE_REGULATORY_NETWORK,
            modalities=["scRNA-seq", "scATAC-seq", "scMultiome"],
            description="GRN-based in-silico perturbation prediction",
            input_types=["count_matrix", "peak_matrix"],
            output_types=["network_result", "perturbation_prediction"],
            typical_runtime_hours=4.0,
        ),
        # --- Cell-cell communication ---
        ToolSpec(
            name="CellChat",
            category=ToolCategory.CELL_COMMUNICATION,
            modalities=["scRNA-seq", "spatial_transcriptomics"],
            description="Ligand-receptor interaction inference with communication patterns",
            input_types=["count_matrix", "cluster_result"],
            output_types=["communication_result"],
        ),
        ToolSpec(
            name="NicheNet",
            category=ToolCategory.CELL_COMMUNICATION,
            modalities=["scRNA-seq"],
            description="Ligand-target link prediction using prior knowledge",
            input_types=["count_matrix", "de_result"],
            output_types=["communication_result"],
        ),
        ToolSpec(
            name="LIANA",
            category=ToolCategory.CELL_COMMUNICATION,
            modalities=["scRNA-seq", "spatial_transcriptomics"],
            description="Framework unifying multiple ligand-receptor methods",
            input_types=["count_matrix", "cluster_result"],
            output_types=["communication_result"],
        ),
        # --- Spatial analysis ---
        ToolSpec(
            name="squidpy",
            category=ToolCategory.SPATIAL,
            modalities=["spatial_transcriptomics"],
            description="Spatial omics analysis (neighborhood, co-occurrence, image features)",
            input_types=["count_matrix", "spatial_coords"],
            output_types=["spatial_result"],
        ),
        ToolSpec(
            name="cell2location",
            category=ToolCategory.SPATIAL,
            modalities=["spatial_transcriptomics"],
            description="Spatial deconvolution mapping cell types to tissue locations",
            input_types=["count_matrix", "spatial_coords", "reference_h5ad"],
            output_types=["deconvolution_result"],
            requires_gpu=True,
        ),
        ToolSpec(
            name="BANKSY",
            category=ToolCategory.SPATIAL,
            modalities=["spatial_transcriptomics"],
            description="Spatially-aware clustering combining cell and neighbor features",
            input_types=["count_matrix", "spatial_coords"],
            output_types=["cluster_result"],
        ),
        # --- Batch correction ---
        ToolSpec(
            name="Harmony",
            category=ToolCategory.BATCH_CORRECTION,
            modalities=["scRNA-seq", "scATAC-seq", "CITE-seq"],
            description="Fast iterative batch correction on PCA embeddings",
            input_types=["pca_embedding"],
            output_types=["corrected_embedding"],
        ),
        ToolSpec(
            name="scanorama",
            category=ToolCategory.BATCH_CORRECTION,
            modalities=["scRNA-seq"],
            description="Panoramic stitching of scRNA-seq batches",
            input_types=["count_matrix"],
            output_types=["corrected_embedding", "corrected_matrix"],
        ),
        ToolSpec(
            name="BBKNN",
            category=ToolCategory.BATCH_CORRECTION,
            modalities=["scRNA-seq"],
            description="Batch-balanced KNN graph construction",
            input_types=["pca_embedding"],
            output_types=["knn_graph"],
        ),
        # --- Multimodal integration ---
        ToolSpec(
            name="WNN",
            category=ToolCategory.MULTIMODAL_INTEGRATION,
            modalities=["CITE-seq", "scMultiome"],
            description="Weighted nearest neighbors for multimodal integration (Seurat v4+)",
            input_types=["rna_embedding", "protein_embedding"],
            output_types=["multimodal_embedding"],
        ),
        ToolSpec(
            name="MOFA+",
            category=ToolCategory.MULTIMODAL_INTEGRATION,
            modalities=["scMultiome", "CITE-seq"],
            description="Multi-omics factor analysis for unsupervised integration",
            input_types=["count_matrix", "peak_matrix"],
            output_types=["factor_result"],
        ),
        # --- Chromatin accessibility (scATAC-seq / scMultiome) ---
        ToolSpec(
            name="ArchR",
            category=ToolCategory.PREPROCESSING,
            modalities=["scATAC-seq", "scMultiome"],
            description="Full-featured scATAC-seq analysis framework in R",
            input_types=["fragments", "bam"],
            output_types=["peak_matrix", "gene_activity_matrix"],
            typical_runtime_hours=2.0,
        ),
        ToolSpec(
            name="Signac",
            category=ToolCategory.PREPROCESSING,
            modalities=["scATAC-seq", "scMultiome"],
            description="Seurat extension for chromatin accessibility analysis",
            input_types=["fragments", "peak_matrix"],
            output_types=["peak_matrix", "motif_result"],
        ),
        ToolSpec(
            name="chromVAR",
            category=ToolCategory.PEAK_CALLING,
            modalities=["scATAC-seq", "scMultiome"],
            description="TF motif accessibility deviation scoring",
            input_types=["peak_matrix"],
            output_types=["motif_deviation_scores"],
        ),
        # --- Gene set / pathway analysis ---
        ToolSpec(
            name="GSEA",
            category=ToolCategory.GENE_SET_ANALYSIS,
            modalities=["bulk_rna_seq", "scRNA-seq"],
            description="Gene Set Enrichment Analysis (preranked or phenotype-based)",
            input_types=["de_result", "ranked_gene_list"],
            output_types=["pathway_result"],
        ),
        ToolSpec(
            name="clusterProfiler",
            category=ToolCategory.GENE_SET_ANALYSIS,
            modalities=["bulk_rna_seq", "scRNA-seq"],
            description="ORA & GSEA with GO, KEGG, Reactome, and custom gene sets",
            input_types=["de_result", "gene_list"],
            output_types=["pathway_result"],
        ),
        ToolSpec(
            name="decoupleR",
            category=ToolCategory.GENE_SET_ANALYSIS,
            modalities=["scRNA-seq", "bulk_rna_seq", "spatial_transcriptomics"],
            description="Unified framework for functional activity inference (TF, pathway)",
            input_types=["count_matrix", "de_result"],
            output_types=["activity_scores"],
        ),
        # --- Cell type annotation ---
        ToolSpec(
            name="celltypist",
            category=ToolCategory.CELL_TYPE_ANNOTATION,
            modalities=["scRNA-seq"],
            description="Automated cell type classification with pre-trained models",
            input_types=["count_matrix"],
            output_types=["annotation_result"],
        ),
        ToolSpec(
            name="SingleR",
            category=ToolCategory.CELL_TYPE_ANNOTATION,
            modalities=["scRNA-seq"],
            description="Reference-based cell type annotation using correlation",
            input_types=["count_matrix", "reference_dataset"],
            output_types=["annotation_result"],
        ),
        ToolSpec(
            name="scArches",
            category=ToolCategory.CELL_TYPE_ANNOTATION,
            modalities=["scRNA-seq", "scATAC-seq", "CITE-seq"],
            description="Reference mapping and label transfer via deep learning",
            input_types=["count_matrix", "reference_model"],
            output_types=["annotation_result", "latent_embedding"],
            requires_gpu=True,
        ),
        # --- Imputation ---
        ToolSpec(
            name="MAGIC",
            category=ToolCategory.IMPUTATION,
            modalities=["scRNA-seq"],
            description="Markov affinity-based graph imputation of dropout zeros",
            input_types=["count_matrix"],
            output_types=["imputed_matrix"],
        ),
        # --- Perturbation / differential abundance analysis ---
        ToolSpec(
            name="MILO",
            category=ToolCategory.PERTURBATION_ANALYSIS,
            modalities=["scRNA-seq"],
            description="Differential abundance testing on KNN graph neighborhoods",
            input_types=["count_matrix", "knn_graph"],
            output_types=["da_result"],
        ),
        ToolSpec(
            name="Mixscape",
            category=ToolCategory.PERTURBATION_ANALYSIS,
            modalities=["Perturb-seq", "CROP-seq"],
            description="Seurat extension for CRISPR screen perturbation analysis",
            input_types=["count_matrix", "guide_assignments"],
            output_types=["perturbation_result"],
        ),
        ToolSpec(
            name="MIMOSCA",
            category=ToolCategory.PERTURBATION_ANALYSIS,
            modalities=["Perturb-seq", "CROP-seq"],
            description="Multi-input multi-output single-cell analysis for screens",
            input_types=["count_matrix", "guide_assignments"],
            output_types=["perturbation_result"],
        ),
        # --- Quality control ---
        ToolSpec(
            name="scrublet",
            category=ToolCategory.QUALITY_CONTROL,
            modalities=["scRNA-seq"],
            description="Computational doublet detection via synthetic doublets",
            input_types=["count_matrix"],
            output_types=["doublet_scores"],
        ),
        ToolSpec(
            name="DoubletFinder",
            category=ToolCategory.QUALITY_CONTROL,
            modalities=["scRNA-seq"],
            description="Artificial nearest-neighbor doublet detection",
            input_types=["count_matrix"],
            output_types=["doublet_scores"],
        ),
        ToolSpec(
            name="SoupX",
            category=ToolCategory.QUALITY_CONTROL,
            modalities=["scRNA-seq"],
            description="Ambient RNA contamination estimation and removal",
            input_types=["count_matrix", "raw_count_matrix"],
            output_types=["corrected_matrix"],
        ),
        ToolSpec(
            name="DecontX",
            category=ToolCategory.QUALITY_CONTROL,
            modalities=["scRNA-seq"],
            description="Bayesian ambient RNA decontamination",
            input_types=["count_matrix"],
            output_types=["corrected_matrix"],
        ),
        # --- Simulation ---
        ToolSpec(
            name="Splatter",
            category=ToolCategory.SIMULATION,
            modalities=["scRNA-seq"],
            description="Flexible scRNA-seq data simulation framework",
            input_types=["simulation_params"],
            output_types=["simulated_count_matrix"],
        ),
    )
}
|
| |
|
| |
|
class Modality(str, Enum):
    """Identifiers for single-cell and bulk assay modalities.

    Members subclass ``str``; each compares equal to its string value and can
    be constructed from it (``Modality("scRNA-seq")``).  The values are the
    same strings used as ``MODALITY_REGISTRY`` keys.
    """

    SCRNA_SEQ = "scRNA-seq"
    SCATAC_SEQ = "scATAC-seq"
    CITE_SEQ = "CITE-seq"
    SPATIAL_TRANSCRIPTOMICS = "spatial_transcriptomics"
    BULK_RNA_SEQ = "bulk_rna_seq"
    SCRNA_MULTIOME = "scMultiome"
    PERTURB_SEQ = "Perturb-seq"
    CROP_SEQ = "CROP-seq"
    SMART_SEQ2 = "Smart-seq2"
    SLIDE_SEQ = "Slide-seq"
    MERFISH = "MERFISH"
    SEQFISH = "seqFISH"
    PATCH_SEQ = "Patch-seq"
    SHARE_SEQ = "SHARE-seq"
    SNARE_SEQ = "SNARE-seq"
    SC_HI_C = "scHi-C"
    SCBS_SEQ = "scBS-seq"
    SCNMT_SEQ = "scNMT-seq"
|
| |
|
| |
|
class ModalitySpec(BaseModel):
    """Registry entry for a single-cell or bulk assay modality.

    Only ``name`` and ``modality`` are required; every other field has a
    default.
    """

    name: str  # modality identifier
    modality: Modality
    measurement: str = ""  # free-text description of what is measured
    resolution: str = "single-cell"
    multiplexable: bool = False
    typical_cells: str = "1k-20k"  # free-text range, not a parsed number
    typical_cost_per_sample_usd: float = 5000.0
    compatible_tools: List[str] = Field(default_factory=list)  # tool names, e.g. "Scanpy"
    description: str = ""
|
| |
|
| |
|
# Registry of assay modalities.
#
# Each key is taken from the spec's own ``name`` so the lookup key and the
# stored name cannot drift apart.  Insertion order follows the tuple below.
# The micro sign in ``resolution`` strings is written as the escape \u00b5
# (it had been corrupted by a bad transcoding step) so it survives any
# further re-encoding of this file.
MODALITY_REGISTRY: Dict[str, ModalitySpec] = {
    spec.name: spec
    for spec in (
        ModalitySpec(
            name="scRNA-seq",
            modality=Modality.SCRNA_SEQ,
            measurement="mRNA transcripts",
            typical_cells="5k-20k",
            typical_cost_per_sample_usd=5000.0,
            compatible_tools=[
                "CellRanger", "STARsolo", "kallisto_bustools", "Scanpy", "Seurat",
                "scVI", "Leiden", "DESeq2", "MAST", "Monocle3", "scVelo", "SCENIC",
                "CellChat", "GSEA", "celltypist", "scrublet",
            ],
            description="Droplet-based single-cell RNA sequencing (e.g. 10x Chromium)",
        ),
        ModalitySpec(
            name="scATAC-seq",
            modality=Modality.SCATAC_SEQ,
            measurement="open chromatin regions",
            typical_cells="5k-15k",
            typical_cost_per_sample_usd=6000.0,
            compatible_tools=[
                "CellRanger", "ArchR", "Signac", "chromVAR", "Scanpy", "Leiden",
            ],
            description="Single-cell Assay for Transposase-Accessible Chromatin",
        ),
        ModalitySpec(
            name="CITE-seq",
            modality=Modality.CITE_SEQ,
            measurement="mRNA + surface proteins (ADT)",
            multiplexable=True,
            typical_cells="5k-20k",
            typical_cost_per_sample_usd=8000.0,
            compatible_tools=[
                "CellRanger", "Seurat", "WNN", "MOFA+", "Scanpy", "Leiden",
            ],
            description="Cellular Indexing of Transcriptomes and Epitopes by Sequencing",
        ),
        ModalitySpec(
            name="spatial_transcriptomics",
            modality=Modality.SPATIAL_TRANSCRIPTOMICS,
            measurement="spatially resolved transcripts",
            resolution="spot (55\u00b5m) or subcellular",
            typical_cells="1k-10k spots",
            typical_cost_per_sample_usd=7000.0,
            compatible_tools=[
                "spaceranger", "squidpy", "cell2location", "BANKSY", "Scanpy", "Seurat",
            ],
            description="Spatially resolved transcriptomics (Visium, MERFISH, Slide-seq, etc.)",
        ),
        ModalitySpec(
            name="bulk_rna_seq",
            modality=Modality.BULK_RNA_SEQ,
            measurement="aggregate mRNA across cells",
            resolution="bulk",
            typical_cells="N/A",
            typical_cost_per_sample_usd=500.0,
            compatible_tools=["DESeq2", "edgeR", "GSEA", "clusterProfiler"],
            description="Standard bulk RNA sequencing",
        ),
        ModalitySpec(
            name="scMultiome",
            modality=Modality.SCRNA_MULTIOME,
            measurement="mRNA + open chromatin (joint)",
            typical_cells="5k-15k",
            typical_cost_per_sample_usd=10000.0,
            compatible_tools=[
                "CellRanger", "ArchR", "Signac", "Seurat", "MOFA+", "CellOracle",
            ],
            description="10x Multiome (joint scRNA + scATAC from same cell)",
        ),
        ModalitySpec(
            name="Perturb-seq",
            modality=Modality.PERTURB_SEQ,
            measurement="mRNA + CRISPR guide assignment",
            multiplexable=True,
            typical_cells="10k-100k",
            typical_cost_per_sample_usd=15000.0,
            compatible_tools=[
                "CellRanger", "Scanpy", "Seurat", "Mixscape", "MIMOSCA",
            ],
            description="Pooled CRISPR screens with single-cell RNA readout",
        ),
        ModalitySpec(
            name="CROP-seq",
            modality=Modality.CROP_SEQ,
            measurement="mRNA + CRISPR guide assignment",
            multiplexable=True,
            typical_cells="10k-50k",
            typical_cost_per_sample_usd=12000.0,
            compatible_tools=[
                "CellRanger", "Scanpy", "Seurat", "Mixscape", "MIMOSCA",
            ],
            description="CRISPR dropout screen with single-cell RNA readout",
        ),
        ModalitySpec(
            name="Smart-seq2",
            modality=Modality.SMART_SEQ2,
            measurement="full-length mRNA transcripts",
            typical_cells="100-1000",
            typical_cost_per_sample_usd=10000.0,
            compatible_tools=["Scanpy", "Seurat", "DESeq2", "MAST", "Monocle3"],
            description="Plate-based full-length scRNA-seq with high sensitivity",
        ),
        ModalitySpec(
            name="MERFISH",
            modality=Modality.MERFISH,
            measurement="in situ mRNA (imaging-based)",
            resolution="subcellular",
            typical_cells="10k-1M",
            typical_cost_per_sample_usd=20000.0,
            compatible_tools=["squidpy", "Scanpy", "BANKSY"],
            description="Multiplexed Error-Robust FISH for spatial transcriptomics",
        ),
        ModalitySpec(
            name="Slide-seq",
            modality=Modality.SLIDE_SEQ,
            measurement="spatially resolved mRNA (bead array)",
            resolution="10\u00b5m",
            typical_cells="10k-50k beads",
            typical_cost_per_sample_usd=8000.0,
            compatible_tools=["squidpy", "cell2location", "Scanpy"],
            description="Near-cellular spatial transcriptomics on bead arrays",
        ),
        ModalitySpec(
            name="Patch-seq",
            modality=Modality.PATCH_SEQ,
            measurement="mRNA + electrophysiology + morphology",
            typical_cells="10-500",
            typical_cost_per_sample_usd=50000.0,
            compatible_tools=["Scanpy", "Seurat"],
            description="Combined patch-clamp electrophysiology and scRNA-seq",
        ),
        ModalitySpec(
            name="scHi-C",
            modality=Modality.SC_HI_C,
            measurement="3D chromatin contacts",
            typical_cells="1k-10k",
            typical_cost_per_sample_usd=15000.0,
            compatible_tools=["Scanpy"],
            description="Single-cell chromosome conformation capture",
        ),
        ModalitySpec(
            name="scBS-seq",
            modality=Modality.SCBS_SEQ,
            measurement="DNA methylation (CpG)",
            typical_cells="100-5k",
            typical_cost_per_sample_usd=12000.0,
            compatible_tools=["Scanpy"],
            description="Single-cell bisulfite sequencing for DNA methylation",
        ),
        ModalitySpec(
            name="scNMT-seq",
            modality=Modality.SCNMT_SEQ,
            measurement="nucleosome + methylation + transcription (joint)",
            typical_cells="100-1k",
            typical_cost_per_sample_usd=25000.0,
            compatible_tools=["MOFA+", "Scanpy"],
            description="Joint single-cell nucleosome, methylation, and transcription",
        ),
    )
}
|
| |
|
| |
|
class AssayCategory(str, Enum):
    """Broad category assigned to a laboratory assay or protocol.

    Members subclass ``str``; each compares equal to its string value and can
    be constructed from it (``AssayCategory("imaging")``).
    """

    SEQUENCING = "sequencing"
    IMAGING = "imaging"
    PERTURBATION = "perturbation"
    FUNCTIONAL = "functional"
    EPIGENOMICS = "epigenomics"
    PROTEOMICS = "proteomics"
    METABOLOMICS = "metabolomics"
|
| |
|
| |
|
class AssaySpec(BaseModel):
    """Registry entry for a laboratory assay or protocol.

    Only ``name`` and ``category`` are required; every other field has a
    default.  List fields use ``default_factory`` so each instance gets its
    own list.
    """

    name: str  # assay identifier
    category: AssayCategory
    modalities: List[str] = Field(default_factory=list)  # modality names, e.g. "scRNA-seq"
    description: str = ""
    typical_duration_days: float = 1.0
    typical_cost_usd: float = 1000.0
    requires_live_cells: bool = False
    requires_fresh_tissue: bool = False
    throughput: str = "medium"  # free-text label, not a parsed number
    outputs: List[str] = Field(default_factory=list)  # artifact labels produced, e.g. "fastq"
|
| |
|
| |
|
| | ASSAY_REGISTRY: Dict[str, AssaySpec] = {
|
| | "10x_chromium": AssaySpec(
|
| | name="10x_chromium",
|
| | category=AssayCategory.SEQUENCING,
|
| | modalities=["scRNA-seq", "scATAC-seq", "CITE-seq", "scMultiome"],
|
| | description="10x Genomics Chromium droplet-based single-cell partitioning",
|
| | typical_duration_days=2.0,
|
| | typical_cost_usd=5000.0,
|
| | requires_live_cells=True,
|
| | throughput="high (500-20k cells)",
|
| | outputs=["fastq", "count_matrix"],
|
| | ),
|
| | "smart-seq2": AssaySpec(
|
| | name="smart-seq2",
|
| | category=AssayCategory.SEQUENCING,
|
| | modalities=["Smart-seq2"],
|
| | description="Plate-based full-length cDNA scRNA-seq",
|
| | typical_duration_days=3.0,
|
| | typical_cost_usd=10000.0,
|
| | requires_live_cells=True,
|
| | throughput="low (96-384 cells)",
|
| | outputs=["fastq", "count_matrix"],
|
| | ),
|
| | "smart-seq3": AssaySpec(
|
| | name="smart-seq3",
|
| | category=AssayCategory.SEQUENCING,
|
| | modalities=["Smart-seq2"],
|
| | description="Improved full-length scRNA-seq with UMIs",
|
| | typical_duration_days=3.0,
|
| | typical_cost_usd=10000.0,
|
| | requires_live_cells=True,
|
| | throughput="low (96-384 cells)",
|
| | outputs=["fastq", "count_matrix"],
|
| | ),
|
| | "bulk_rna_seq": AssaySpec(
|
| | name="bulk_rna_seq",
|
| | category=AssayCategory.SEQUENCING,
|
| | modalities=["bulk_rna_seq"],
|
| | description="Standard bulk RNA sequencing with poly-A or ribo-depletion",
|
| | typical_duration_days=3.0,
|
| | typical_cost_usd=500.0,
|
| | throughput="high",
|
| | outputs=["fastq", "count_matrix"],
|
| | ),
|
| | "atac-seq": AssaySpec(
|
| | name="atac-seq",
|
| | category=AssayCategory.EPIGENOMICS,
|
| | modalities=["scATAC-seq"],
|
| | description="Assay for Transposase-Accessible Chromatin using sequencing",
|
| | typical_duration_days=2.0,
|
| | typical_cost_usd=6000.0,
|
| | requires_live_cells=True,
|
| | outputs=["fastq", "fragments", "peak_matrix"],
|
| | ),
|
| | "cite-seq": AssaySpec(
|
| | name="cite-seq",
|
| | category=AssayCategory.PROTEOMICS,
|
| | modalities=["CITE-seq"],
|
| | description="Simultaneous RNA + surface protein via DNA-barcoded antibodies",
|
| | typical_duration_days=2.0,
|
| | typical_cost_usd=8000.0,
|
| | requires_live_cells=True,
|
| | throughput="high (5k-20k cells)",
|
| | outputs=["fastq", "count_matrix", "adt_matrix"],
|
| | ),
|
| | "10x_multiome": AssaySpec(
|
| | name="10x_multiome",
|
| | category=AssayCategory.SEQUENCING,
|
| | modalities=["scMultiome"],
|
| | description="Joint scRNA-seq + scATAC-seq from the same cell",
|
| | typical_duration_days=2.0,
|
| | typical_cost_usd=10000.0,
|
| | requires_live_cells=True,
|
| | throughput="high (5k-15k cells)",
|
| | outputs=["fastq", "count_matrix", "fragments"],
|
| | ),
|
| | "visium": AssaySpec(
|
| | name="visium",
|
| | category=AssayCategory.SEQUENCING,
|
| | modalities=["spatial_transcriptomics"],
|
| | description="10x Visium spatially barcoded capture on tissue sections",
|
| | typical_duration_days=3.0,
|
| | typical_cost_usd=7000.0,
|
| | requires_fresh_tissue=True,
|
| | throughput="medium (1k-5k spots)",
|
| | outputs=["fastq", "count_matrix", "spatial_coords", "image"],
|
| | ),
|
| | "visium_hd": AssaySpec(
|
| | name="visium_hd",
|
| | category=AssayCategory.SEQUENCING,
|
| | modalities=["spatial_transcriptomics"],
|
| | description="High-definition Visium with 2Β΅m bin resolution",
|
| | typical_duration_days=3.0,
|
| | typical_cost_usd=10000.0,
|
| | requires_fresh_tissue=True,
|
| | throughput="high",
|
| | outputs=["fastq", "count_matrix", "spatial_coords", "image"],
|
| | ),
|
| | "merfish": AssaySpec(
|
| | name="merfish",
|
| | category=AssayCategory.IMAGING,
|
| | modalities=["MERFISH"],
|
| | description="Multiplexed Error-Robust FISH imaging-based spatial",
|
| | typical_duration_days=5.0,
|
| | typical_cost_usd=20000.0,
|
| | requires_fresh_tissue=True,
|
| | throughput="high (100-1000 genes, millions of transcripts)",
|
| | outputs=["transcript_coords", "cell_segmentation"],
|
| | ),
|
| | "seqfish_plus": AssaySpec(
|
| | name="seqfish_plus",
|
| | category=AssayCategory.IMAGING,
|
| | modalities=["seqFISH"],
|
| | description="Sequential FISH for imaging-based spatial transcriptomics",
|
| | typical_duration_days=5.0,
|
| | typical_cost_usd=15000.0,
|
| | requires_fresh_tissue=True,
|
| | outputs=["transcript_coords"],
|
| | ),
|
| | "slide-seq": AssaySpec(
|
| | name="slide-seq",
|
| | category=AssayCategory.SEQUENCING,
|
| | modalities=["Slide-seq"],
|
| | description="Near-cellular spatial transcriptomics on bead arrays",
|
| | typical_duration_days=3.0,
|
| | typical_cost_usd=8000.0,
|
| | requires_fresh_tissue=True,
|
| | outputs=["count_matrix", "spatial_coords"],
|
| | ),
|
| | "perturb-seq": AssaySpec(
|
| | name="perturb-seq",
|
| | category=AssayCategory.PERTURBATION,
|
| | modalities=["Perturb-seq"],
|
| | description="Pooled CRISPR screen + scRNA-seq readout",
|
| | typical_duration_days=14.0,
|
| | typical_cost_usd=15000.0,
|
| | requires_live_cells=True,
|
| | throughput="high (10k-100k cells)",
|
| | outputs=["fastq", "count_matrix", "guide_assignments"],
|
| | ),
|
| | "crop-seq": AssaySpec(
|
| | name="crop-seq",
|
| | category=AssayCategory.PERTURBATION,
|
| | modalities=["CROP-seq"],
|
| | description="CRISPR dropout screening with scRNA-seq readout",
|
| | typical_duration_days=14.0,
|
| | typical_cost_usd=12000.0,
|
| | requires_live_cells=True,
|
| | throughput="high (10k-50k cells)",
|
| | outputs=["fastq", "count_matrix", "guide_assignments"],
|
| | ),
|
| | "patch-seq": AssaySpec(
|
| | name="patch-seq",
|
| | category=AssayCategory.FUNCTIONAL,
|
| | modalities=["Patch-seq"],
|
| | description="Patch-clamp electrophysiology + scRNA-seq on same neuron",
|
| | typical_duration_days=7.0,
|
| | typical_cost_usd=50000.0,
|
| | requires_live_cells=True,
|
| | throughput="very low (10-100 cells)",
|
| | outputs=["fastq", "count_matrix", "ephys_trace", "morphology"],
|
| | ),
|
| | "sc_hi_c": AssaySpec(
|
| | name="sc_hi_c",
|
| | category=AssayCategory.EPIGENOMICS,
|
| | modalities=["scHi-C"],
|
| | description="Single-cell chromosome conformation capture",
|
| | typical_duration_days=5.0,
|
| | typical_cost_usd=15000.0,
|
| | outputs=["contact_matrix"],
|
| | ),
|
| | "sc_bisulfite": AssaySpec(
|
| | name="sc_bisulfite",
|
| | category=AssayCategory.EPIGENOMICS,
|
| | modalities=["scBS-seq"],
|
| | description="Single-cell bisulfite sequencing for DNA methylation profiling",
|
| | typical_duration_days=5.0,
|
| | typical_cost_usd=12000.0,
|
| | outputs=["methylation_matrix"],
|
| | ),
|
| | "sc_nmt_seq": AssaySpec(
|
| | name="sc_nmt_seq",
|
| | category=AssayCategory.EPIGENOMICS,
|
| | modalities=["scNMT-seq"],
|
| | description="Joint nucleosome occupancy, methylation, and transcription",
|
| | typical_duration_days=7.0,
|
| | typical_cost_usd=25000.0,
|
| | requires_live_cells=True,
|
| | throughput="low (100-1k cells)",
|
| | outputs=["count_matrix", "methylation_matrix", "accessibility_matrix"],
|
| | ),
|
| | "flow_cytometry": AssaySpec(
|
| | name="flow_cytometry",
|
| | category=AssayCategory.FUNCTIONAL,
|
| | modalities=[],
|
| | description="Fluorescence-based cell sorting and phenotyping",
|
| | typical_duration_days=1.0,
|
| | typical_cost_usd=500.0,
|
| | requires_live_cells=True,
|
| | throughput="very high (millions of cells)",
|
| | outputs=["cell_counts", "sorted_cells"],
|
| | ),
|
| | "mass_cytometry_CyTOF": AssaySpec(
|
| | name="mass_cytometry_CyTOF",
|
| | category=AssayCategory.PROTEOMICS,
|
| | modalities=[],
|
| | description="Mass-tag cytometry for 40+ protein markers per cell",
|
| | typical_duration_days=2.0,
|
| | typical_cost_usd=3000.0,
|
| | requires_live_cells=True,
|
| | throughput="high (100k-1M cells)",
|
| | outputs=["protein_expression_matrix"],
|
| | ),
|
| | "western_blot": AssaySpec(
|
| | name="western_blot",
|
| | category=AssayCategory.PROTEOMICS,
|
| | modalities=[],
|
| | description="Protein detection and semi-quantification by size separation",
|
| | typical_duration_days=2.0,
|
| | typical_cost_usd=200.0,
|
| | outputs=["band_image", "relative_quantification"],
|
| | ),
|
| | "qPCR": AssaySpec(
|
| | name="qPCR",
|
| | category=AssayCategory.FUNCTIONAL,
|
| | modalities=[],
|
| | description="Quantitative PCR for targeted gene expression validation",
|
| | typical_duration_days=1.0,
|
| | typical_cost_usd=100.0,
|
| | throughput="low (target genes)",
|
| | outputs=["ct_values", "fold_change"],
|
| | ),
|
| | "immunofluorescence": AssaySpec(
|
| | name="immunofluorescence",
|
| | category=AssayCategory.IMAGING,
|
| | modalities=[],
|
| | description="Antibody-based fluorescence imaging of proteins in situ",
|
| | typical_duration_days=2.0,
|
| | typical_cost_usd=500.0,
|
| | outputs=["fluorescence_image"],
|
| | ),
|
| | "elisa": AssaySpec(
|
| | name="elisa",
|
| | category=AssayCategory.PROTEOMICS,
|
| | modalities=[],
|
| | description="Enzyme-linked immunosorbent assay for secreted protein quantification",
|
| | typical_duration_days=1.0,
|
| | typical_cost_usd=300.0,
|
| | throughput="medium (96-384 well)",
|
| | outputs=["protein_concentration"],
|
| | ),
|
| | "cell_viability_assay": AssaySpec(
|
| | name="cell_viability_assay",
|
| | category=AssayCategory.FUNCTIONAL,
|
| | modalities=[],
|
| | description="MTT/CellTiter-Glo viability and proliferation measurement",
|
| | typical_duration_days=1.0,
|
| | typical_cost_usd=200.0,
|
| | requires_live_cells=True,
|
| | throughput="high (96-384 well)",
|
| | outputs=["viability_scores"],
|
| | ),
|
| | }
|
| |
|
| |
|
| |
|
| |
|
| |
|
def tools_for_modality(modality: str) -> List[ToolSpec]:
    """Collect every tool in ``TOOL_REGISTRY`` whose ``modalities`` lists *modality*.

    Registry iteration order is preserved; a modality that no tool lists
    yields an empty list.
    """
    matching: List[ToolSpec] = []
    for spec in TOOL_REGISTRY.values():
        if modality in spec.modalities:
            matching.append(spec)
    return matching
|
| |
|
| |
|
def assays_for_modality(modality: str) -> List[AssaySpec]:
    """Collect every assay in ``ASSAY_REGISTRY`` whose ``modalities`` lists *modality*.

    Registry iteration order is preserved; a modality that no assay lists
    yields an empty list.
    """
    matching: List[AssaySpec] = []
    for spec in ASSAY_REGISTRY.values():
        if modality in spec.modalities:
            matching.append(spec)
    return matching
|
| |
|
| |
|
def tools_by_category(category: ToolCategory) -> List[ToolSpec]:
    """Collect every tool in ``TOOL_REGISTRY`` whose ``category`` equals *category*.

    Registry iteration order is preserved; an unused category yields an
    empty list.
    """
    selected: List[ToolSpec] = []
    for spec in TOOL_REGISTRY.values():
        if spec.category == category:
            selected.append(spec)
    return selected
|
| |
|
| |
|
| |
|
| |
|
| |
|
class SubagentType(str, Enum):
    """String-valued identifiers for the sub-agents an action may delegate to.

    Used as the type of ``ExperimentAction.invoked_subagent``. Because the
    enum also subclasses ``str``, each member compares equal to its value.
    """

    WET_LAB_PLANNER = "wet_lab_planner"
    COMPUTATIONAL_ANALYST = "computational_analyst"
    OMICS_QC_AGENT = "omics_qc_agent"
    CAUSAL_REASONING_AGENT = "causal_reasoning_agent"
    BUDGET_SCHEDULER = "budget_scheduler"
    BIOLOGICAL_RULE_CHECKER = "biological_rule_checker"
    TOOL_EXECUTOR = "tool_executor"
    RETROSPECTIVE_CRITIC = "retrospective_critic"
    REPORT_SYNTHESIZER = "report_synthesizer"
|
| |
|
| |
|
| |
|
| |
|
| |
|
class ExperimentAction(Action):
    """Structured, compositional action for one experiment / analysis step.

    Hybrid representation: discrete *action_type* plus typed arguments,
    optional sub-agent / tool invocation, and calibration fields.

    Of the fields declared here only ``action_type`` is required; every
    other field has a default (empty container, ``None``, or 0.5 for
    ``confidence``).
    """

    # Required: the Ellipsis default marks the field as mandatory.
    action_type: ActionType = Field(
        ...,
        description=(
            "Discrete simulator step type. The environment enforces scientific "
            "prerequisites between steps, so actions should follow a valid "
            "pipeline order."
        ),
    )
    # default_factory gives each instance its own fresh list.
    input_targets: List[str] = Field(
        default_factory=list,
        description=(
            "Optional references to prior samples, outputs, or artifacts that "
            "this step consumes."
        ),
    )
    method: Optional[str] = Field(
        None,
        description=(
            "Optional named tool or protocol (for example 'Seurat' or "
            "'CellRanger'). Prefer methods compatible with the current "
            "modality and available tool list because tool choice can change "
            "runtime, cost, and scientific fit."
        ),
    )
    # Free-form mapping; for synthesize_conclusion the system prompt asks for
    # a "claims" list under this key.
    parameters: Dict[str, Any] = Field(
        default_factory=dict,
        description=(
            "Action-specific settings such as comparison labels, perturbation "
            "targets, or analysis options. Use only parameters that materially "
            "change the scientific step."
        ),
    )
    expected_output_type: Optional[str] = Field(
        None,
        description=(
            "Optional expected artifact or summary that should result from the "
            "step, such as a count matrix, QC report, DE table, or validation "
            "result."
        ),
    )
    justification: Optional[str] = Field(
        None,
        description=(
            "Short scientific rationale explaining why this is the right next "
            "step in the current environment state."
        ),
    )
    invoked_subagent: Optional[SubagentType] = Field(
        None, description="Sub-agent to delegate to, if any"
    )
    tool_call_spec: Optional[Dict[str, Any]] = Field(
        None,
        description=(
            "Optional structured tool invocation payload when the action needs "
            "a more explicit tool execution plan."
        ),
    )
    # Validated to lie in the closed interval [0.0, 1.0]; defaults to 0.5.
    confidence: float = Field(
        0.5, ge=0.0, le=1.0, description="Agent confidence in this step"
    )
|
| |
|
| |
|
| |
|
| |
|
| |
|
class OutputType(str, Enum):
    """String-valued tags naming the kind of result a pipeline step produced.

    Used as the type of ``IntermediateOutput.output_type`` and
    ``PipelineStepRecord.output_type``.
    """

    QC_METRICS = "qc_metrics"
    COUNT_MATRIX_SUMMARY = "count_matrix_summary"
    EMBEDDING_SUMMARY = "embedding_summary"
    CLUSTER_RESULT = "cluster_result"
    DE_RESULT = "de_result"
    PATHWAY_RESULT = "pathway_result"
    TRAJECTORY_RESULT = "trajectory_result"
    VALIDATION_RESULT = "validation_result"
    NETWORK_RESULT = "network_result"
    SAMPLE_COLLECTION_RESULT = "sample_collection_result"
    LIBRARY_PREP_RESULT = "library_prep_result"
    SEQUENCING_RESULT = "sequencing_result"
    PERTURBATION_RESULT = "perturbation_result"
    CULTURE_RESULT = "culture_result"
    COHORT_RESULT = "cohort_result"
    FOLLOWUP_DESIGN = "followup_design"
    MARKER_RESULT = "marker_result"
    FAILURE_REPORT = "failure_report"
    SUBAGENT_REPORT = "subagent_report"
    CONCLUSION = "conclusion"
|
| |
|
| |
|
class IntermediateOutput(BaseModel):
    """A single simulated output from one pipeline step.

    ``output_type`` and ``step_index`` are required; all other fields
    default to a successful, full-quality, zero-uncertainty result with no
    payload.
    """

    output_type: OutputType
    step_index: int
    success: bool = True
    # Validated to lie in [0.0, 1.0]; defaults to 1.0.
    quality_score: float = Field(1.0, ge=0.0, le=1.0)
    summary: str = ""
    # Free-form payload; its keys are not constrained by this model.
    data: Dict[str, Any] = Field(default_factory=dict)
    # Validated to lie in [0.0, 1.0]; defaults to 0.0.
    uncertainty: float = Field(0.0, ge=0.0, le=1.0)
    warnings: List[str] = Field(default_factory=list)
    artifacts_available: List[str] = Field(default_factory=list)
|
| |
|
| |
|
| |
|
| |
|
| |
|
class ResourceUsage(BaseModel):
    """Running totals of resources used and remaining.

    The defaults describe an untouched allowance: nothing used, 100,000
    budget units and 180 days remaining. These match the default
    ``budget_limit`` and ``time_limit_days`` of ``TaskSpec``.
    """

    # Budget unit is presumably USD (costs elsewhere are ``*_usd``) -- confirm.
    budget_used: float = 0.0
    budget_remaining: float = 100_000.0
    time_used_days: float = 0.0
    time_remaining_days: float = 180.0
    samples_consumed: int = 0
    compute_hours_used: float = 0.0
|
| |
|
| |
|
class PipelineStepRecord(BaseModel):
    """Summary record of one pipeline step, as kept in
    ``ExperimentObservation.pipeline_history``.

    ``step_index``, ``action_type`` and ``output_type`` are required; the
    remaining fields have defaults.
    """

    step_index: int
    action_type: ActionType
    method: Optional[str] = None
    parameters: Dict[str, Any] = Field(default_factory=dict)
    output_summary: str = ""
    output_type: OutputType
    success: bool = True
    # NOTE(review): unlike IntermediateOutput.quality_score, this field has
    # no ge/le bounds, so out-of-range values are accepted here.
    quality_score: float = 1.0
    # Unit is not stated here; presumably the same unit as the budget -- confirm.
    resource_cost: float = 0.0
    time_cost_days: float = 0.0
|
| |
|
| |
|
class PaperReference(BaseModel):
    """Metadata for a literature source used to ground a task.

    Only ``title`` is required; the identifier fields are optional and
    default to ``None``. No format validation is applied to any of them.
    """

    title: str
    citation: Optional[str] = None
    doi: Optional[str] = None
    pmid: Optional[str] = None
    url: Optional[str] = None
|
| |
|
| |
|
class ExpectedFinding(BaseModel):
    """A paper-backed result that the agent should try to recover.

    Only ``finding`` is required.
    """

    finding: str
    # Free-form label, not an enum; defaults to "claim".
    category: str = "claim"
    keywords: List[str] = Field(default_factory=list)
|
| |
|
| |
|
class TaskSpec(BaseModel):
    """Specification of the biological problem to solve.

    Every field has a default, so ``TaskSpec()`` is valid and describes an
    unspecified human blood scRNA-seq task with a 100,000 budget and a
    180-day time limit.
    """

    problem_statement: str = "Unspecified biological problem"
    modality: str = "scRNA-seq"
    organism: str = "human"
    tissue: str = "blood"
    conditions: List[str] = Field(default_factory=list)
    # NOTE(review): when not supplied, this defaults to *every* key in
    # ASSAY_REGISTRY, i.e. the unfiltered catalog. The modality filtering
    # the description mentions is presumably done by whoever builds the
    # task -- confirm.
    available_assays: List[str] = Field(
        default_factory=lambda: list(ASSAY_REGISTRY.keys()),
        description=(
            "Assays that are scientifically compatible with this task's "
            "modality. These are the relevant assay choices for the episode, "
            "not an unrestricted catalog."
        ),
    )
    # NOTE(review): same as above -- the default is every key in
    # TOOL_REGISTRY, not a modality-filtered subset.
    available_tools: List[str] = Field(
        default_factory=lambda: list(TOOL_REGISTRY.keys()),
        description=(
            "Tools filtered to those compatible with the current task "
            "modality. The agent should treat this list as the preferred tool "
            "set for the episode."
        ),
    )
    budget_limit: float = 100_000.0
    time_limit_days: float = 180.0
    prior_observations: List[str] = Field(default_factory=list)
    success_criteria: List[str] = Field(default_factory=list)
    dataset_metadata: Dict[str, Any] = Field(default_factory=dict)
    paper_references: List[PaperReference] = Field(default_factory=list)
    expected_findings: List[ExpectedFinding] = Field(default_factory=list)
|
| |
|
| |
|
class ConclusionClaim(BaseModel):
    """One structured claim, as collected in ``ExperimentObservation.conclusions``.

    Every field has a default. The field names match the keys of the
    example claim in the system prompt built by
    ``build_agent_system_prompt``.
    """

    claim: str = ""
    top_markers: List[str] = Field(default_factory=list)
    causal_mechanisms: List[str] = Field(default_factory=list)
    # Maps a pathway name to a float; the prompt example uses 0.8. The
    # meaning and range of the number are not constrained here -- presumably
    # a confidence-like score; confirm.
    predicted_pathways: Dict[str, float] = Field(default_factory=dict)
    # Integer step references; presumably step_index values -- confirm.
    evidence_steps: List[int] = Field(default_factory=list)
    # Validated to lie in [0.0, 1.0]; defaults to 0.5.
    confidence: float = Field(0.5, ge=0.0, le=1.0)
    # Free-form string, not an enum; the prompt example uses "causal".
    claim_type: str = "correlational"
    supporting_data: Dict[str, Any] = Field(default_factory=dict)
|
| |
|
| |
|
| |
|
| |
|
| |
|
class ExperimentObservation(Observation):
    """Full observable state returned to the agent at each timestep.

    Deliberately excludes hidden latent biological truth, hidden failure
    conditions, and ground-truth mechanisms.

    Every field declared here has a default, so a bare instance describes
    an episode at step 0 with a default ``TaskSpec`` and no history.
    """

    task: TaskSpec = Field(default_factory=TaskSpec)
    step_index: int = 0
    pipeline_history: List[PipelineStepRecord] = Field(default_factory=list)
    # Empty by default; build_agent_observation_context falls back to
    # task.available_assays when this list is empty.
    available_assays: List[str] = Field(
        default_factory=list,
        description=(
            "Episode-specific assay choices already filtered to the current "
            "modality and task context."
        ),
    )
    # Empty by default; build_agent_observation_context falls back to
    # task.available_tools when this list is empty.
    available_tools: List[str] = Field(
        default_factory=list,
        description=(
            "Episode-specific compatible tools. These are the methods the "
            "agent should prefer instead of inventing incompatible tools."
        ),
    )
    resource_usage: ResourceUsage = Field(
        default_factory=ResourceUsage,
        description=(
            "Running budget, time, and compute usage after previous actions."
        ),
    )
    # None until a step has produced an output.
    latest_output: Optional[IntermediateOutput] = None
    all_outputs: List[IntermediateOutput] = Field(default_factory=list)
    discovered_markers: List[str] = Field(default_factory=list)
    candidate_mechanisms: List[str] = Field(default_factory=list)
    uncertainty_summary: Dict[str, float] = Field(default_factory=dict)
    # Untyped dicts; their schema is not defined by this model.
    subagent_outputs: List[Dict[str, Any]] = Field(default_factory=list)
    conclusions: List[ConclusionClaim] = Field(default_factory=list)
    rule_violations: List[str] = Field(default_factory=list)
    step_reward_breakdown: Dict[str, float] = Field(default_factory=dict)
|
| |
|
| |
|
# Per-action guidance sentences rendered under "Action guidance:" in the
# system prompt. build_agent_system_prompt() indexes this mapping for every
# ActionType member, so a member without an entry raises KeyError there:
# keep this dict in step with the enum.
AGENT_ACTION_GUIDANCE: Dict[ActionType, str] = {
    ActionType.COLLECT_SAMPLE: (
        "Wet-lab entry point. One successful collection usually provides enough "
        "material to continue unless the output shows poor yield or quality."
    ),
    ActionType.SELECT_COHORT: (
        "Use when subject stratification is part of the scientific question "
        "before downstream experimental work."
    ),
    ActionType.PREPARE_LIBRARY: (
        "Requires collected samples and converts biological material into "
        "sequence-ready libraries."
    ),
    ActionType.CULTURE_CELLS: (
        "Requires collected samples and adds substantial time; use only when "
        "live-cell expansion or later perturbation is needed."
    ),
    ActionType.PERTURB_GENE: (
        "Requires samples. Use for causal tests, not as a default discovery "
        "step."
    ),
    ActionType.PERTURB_COMPOUND: (
        "Requires samples. Best for mechanistic follow-up or treatment "
        "response questions."
    ),
    ActionType.SEQUENCE_CELLS: (
        "Requires prepared libraries and produces the raw sequencing-derived "
        "artifacts used by downstream QC and analysis."
    ),
    ActionType.RUN_QC: (
        "Requires sequencing and returns summarized quality metrics such as "
        "doublets, mitochondrial fraction, and ambient RNA."
    ),
    ActionType.FILTER_DATA: (
        "Requires QC and removes poor-quality cells, changing downstream cell "
        "counts and data retention."
    ),
    ActionType.NORMALIZE_DATA: (
        "Requires filtered data and unlocks clustering, differential "
        "expression, trajectory, and network analyses."
    ),
    ActionType.INTEGRATE_BATCHES: (
        "Requires normalized data. Use when batch effects are likely to "
        "confound interpretation; it is not always necessary."
    ),
    ActionType.CLUSTER_CELLS: (
        "Requires normalized data and identifies cell populations or states "
        "for downstream interpretation."
    ),
    ActionType.DIFFERENTIAL_EXPRESSION: (
        "Requires normalized data and is the main route to candidate genes "
        "for pathway analysis and marker selection."
    ),
    ActionType.TRAJECTORY_ANALYSIS: (
        "Requires normalized data and is most useful when lineage progression "
        "or pseudotime is central to the task."
    ),
    ActionType.PATHWAY_ENRICHMENT: (
        "Requires differential expression. Results are less reliable without a "
        "strong DE gene list."
    ),
    ActionType.REGULATORY_NETWORK_INFERENCE: (
        "Requires normalized data and is most helpful once cell states or "
        "trajectories are already characterized."
    ),
    ActionType.MARKER_SELECTION: (
        "Requires differential expression and turns candidate genes into a "
        "short list for validation."
    ),
    ActionType.VALIDATE_MARKER: (
        "Requires discovered markers and is an expensive wet-lab confirmation "
        "step that should follow strong computational evidence."
    ),
    ActionType.DESIGN_FOLLOWUP: (
        "Use to propose targeted next experiments once remaining uncertainty "
        "is clear."
    ),
    ActionType.REQUEST_SUBAGENT_REVIEW: (
        "Use for critique or planning support, not as a substitute for "
        "missing experimental evidence."
    ),
    ActionType.SYNTHESIZE_CONCLUSION: (
        "Use once the evidence is sufficient. Do not spend budget on redundant "
        "steps just because more actions are possible."
    ),
}
|
| |
|
# Environment-wide rules rendered, in this order, as bullets under
# "Environment-specific reasoning rules:" by build_agent_system_prompt().
# Each parenthesised group is one rule built by implicit string
# concatenation.
AGENT_ENVIRONMENT_RULES: List[str] = [
    (
        "Each successful action already returns summarized scientific evidence, "
        "so repeated sampling or repeated analysis is not the default."
    ),
    (
        "Repeat a step only when the task demands it or when prior outputs show "
        "poor quality, insufficient yield, unresolved batch effects, or another "
        "clear failure mode."
    ),
    (
        "The available tool and assay lists are already filtered to the current "
        "task modality, so prefer them over inventing incompatible methods."
    ),
    (
        "Hard scientific prerequisites are enforced by the environment, so "
        "invalid pipeline orderings will be blocked."
    ),
]
|
| |
|
# Per-category usage notes appended to a tool's description by
# describe_tool_for_agent(). The lookup there uses .get(), so a category
# without an entry simply contributes no note.
_TOOL_CATEGORY_AGENT_NOTES: Dict[ToolCategory, str] = {
    ToolCategory.ALIGNMENT: (
        "Best immediately after sequencing to turn FASTQ-like inputs into "
        "count-style matrices for downstream analysis."
    ),
    ToolCategory.PREPROCESSING: (
        "Useful for general single-cell data handling before specialized "
        "downstream analyses."
    ),
    ToolCategory.NORMALIZATION: (
        "Applies after filtering to produce normalized matrices for downstream "
        "modeling."
    ),
    ToolCategory.DIMENSIONALITY_REDUCTION: (
        "Builds latent embeddings that support clustering or trajectory work."
    ),
    ToolCategory.CLUSTERING: (
        "Best once data are normalized and the goal is to resolve cell states "
        "or populations."
    ),
    ToolCategory.DIFFERENTIAL_EXPRESSION: (
        "Tests contrasts and produces ranked genes for biological "
        "interpretation."
    ),
    ToolCategory.TRAJECTORY: (
        "Useful when the task asks about developmental progression, state "
        "transitions, or pseudotime."
    ),
    ToolCategory.GENE_REGULATORY_NETWORK: (
        "Most useful after normalized data and some cell-state structure are "
        "already established."
    ),
    ToolCategory.GENE_SET_ANALYSIS: (
        "Best after differential expression to interpret gene lists at the "
        "pathway level."
    ),
    ToolCategory.BATCH_CORRECTION: (
        "Use when batch effects would confound interpretation; unnecessary use "
        "adds extra steps."
    ),
    ToolCategory.MULTIMODAL_INTEGRATION: (
        "Useful only when combining modalities or batches is part of the "
        "scientific question."
    ),
    ToolCategory.QUALITY_CONTROL: (
        "Helps identify low-quality cells or technical artifacts before "
        "filtering."
    ),
    ToolCategory.CELL_TYPE_ANNOTATION: (
        "Best after clustering when assigning biological identities to groups."
    ),
    ToolCategory.PERTURBATION_ANALYSIS: (
        "Use when perturbations were actually applied and the goal is to model "
        "their transcriptional effects."
    ),
    ToolCategory.SPATIAL: (
        "Only useful when the modality includes spatial coordinates or tissue "
        "context."
    ),
}
|
| |
|
| |
|
def _format_currency(value: float) -> str:
    """Render *value* as a ``$``-prefixed whole number with thousands separators.

    The amount is rounded to zero decimal places by the ``,.0f`` format
    spec; the sign of a negative value follows the dollar sign.
    """
    whole_amount = format(value, ",.0f")
    return "$" + whole_amount
|
| |
|
| |
|
def _format_runtime_hours(hours: float) -> str:
    """Format a runtime given in hours as a compact label.

    Args:
        hours: Runtime in hours.

    Returns:
        Whole minutes (``"30m"``) when the value rounds to less than one
        hour, otherwise hours rounded to one decimal place with a trailing
        ``.0`` dropped (``"2h"``, ``"1.5h"``).

    Values that round to a whole hour are always shown without a decimal,
    so 0.999 gives ``"1h"`` (never ``"60m"``) and 1.96 gives ``"2h"``
    (never ``"2.0h"``).
    """
    if hours < 1.0:
        minutes = int(round(hours * 60))
        if minutes < 60:
            return f"{minutes}m"
        # 59.5+ minutes rounds up to a full hour; fall through so the label
        # reads "1h" instead of the inconsistent "60m".

    # Format first and inspect the text, so the integer check sees the value
    # after rounding to one decimal rather than the raw float.
    text = f"{hours:.1f}"
    if text.endswith(".0"):
        return f"{text[:-2]}h"
    return f"{text}h"
|
| |
|
| |
|
def describe_tool_for_agent(tool_name: str) -> str:
    """Build a one-line, prompt-ready description of a registered tool.

    The result joins up to four space-separated sentences: name and
    description, consumed/produced artifact types, the category note from
    ``_TOOL_CATEGORY_AGENT_NOTES``, and typical resources (cost, runtime,
    GPU). A sentence is omitted when the tool has nothing to report for it.
    A name that is not in ``TOOL_REGISTRY`` is returned unchanged.
    """
    spec = TOOL_REGISTRY.get(tool_name)
    if spec is None:
        # Unregistered tools pass through as their bare name.
        return tool_name

    sentences = [f"{spec.name}: {spec.description}."]

    if spec.input_types or spec.output_types:
        consumed = ", ".join(spec.input_types)
        if not consumed:
            consumed = "upstream artifacts"
        produced = ", ".join(spec.output_types)
        if not produced:
            produced = "analysis artifacts"
        sentences.append(f"Consumes {consumed}; yields {produced}.")

    note = _TOOL_CATEGORY_AGENT_NOTES.get(spec.category)
    if note:
        sentences.append(note)

    # Only positive cost/runtime figures are worth showing.
    resources: List[str] = []
    if spec.typical_cost_usd > 0:
        resources.append(_format_currency(spec.typical_cost_usd))
    if spec.typical_runtime_hours > 0:
        resources.append(_format_runtime_hours(spec.typical_runtime_hours))
    if spec.requires_gpu:
        resources.append("GPU")
    if resources:
        joined = ", ".join(resources)
        sentences.append(f"Typical resources: {joined}.")

    return " ".join(sentences)
|
| |
|
| |
|
def describe_assay_for_agent(assay_name: str) -> str:
    """Build a one-line, prompt-ready description of a registered assay.

    The result joins space-separated sentences: name and description, the
    produced outputs (if any), sample requirements (live cells and/or fresh
    tissue, if any), and the typical cost and duration, which are always
    included. A name that is not in ``ASSAY_REGISTRY`` is returned unchanged.
    """
    spec = ASSAY_REGISTRY.get(assay_name)
    if spec is None:
        # Unregistered assays pass through as their bare name.
        return assay_name

    sentences = [f"{spec.name}: {spec.description}."]

    if spec.outputs:
        produced = ", ".join(spec.outputs)
        sentences.append(f"Produces {produced}.")

    requirement_flags = (
        (spec.requires_live_cells, "live cells"),
        (spec.requires_fresh_tissue, "fresh tissue"),
    )
    needed: List[str] = [label for flag, label in requirement_flags if flag]
    if needed:
        joined = " and ".join(needed)
        sentences.append(f"Requires {joined}.")

    cost = _format_currency(spec.typical_cost_usd)
    sentences.append(
        f"Typical resources: {cost}, {spec.typical_duration_days:.1f}d."
    )
    return " ".join(sentences)
|
| |
|
| |
|
def build_agent_system_prompt() -> str:
    """Assemble the system prompt shared by training and inference.

    The prompt is a newline-joined sequence of: a role preamble, one bullet
    per entry of ``AGENT_ENVIRONMENT_RULES``, one bullet per ``ActionType``
    member (in enum order) with its ``AGENT_ACTION_GUIDANCE`` text, and the
    JSON response-format instructions.

    Raises:
        KeyError: if an ``ActionType`` member has no guidance entry.
    """
    preamble = [
        "You are an expert biologist planning a single-cell experiment pipeline.",
        "",
        "At each turn you see the experiment state and must pick the next scientifically justified step.",
        "",
        "Environment-specific reasoning rules:",
    ]
    rule_bullets = [f" - {rule}" for rule in AGENT_ENVIRONMENT_RULES]

    guidance_bullets: List[str] = []
    for kind in ActionType:
        guidance = AGENT_ACTION_GUIDANCE[kind]
        guidance_bullets.append(f" - {kind.value}: {guidance}")

    response_format = [
        "",
        "Respond with ONLY valid JSON, nothing else:",
        '{"action_type": "...", "method": null, "parameters": {}, "justification": "...", "confidence": 0.8}',
        "",
        "For synthesize_conclusion, use structured claims:",
        '{"action_type": "synthesize_conclusion", "parameters": {"claims": [{"top_markers": ["GENE1", "GENE2"], "causal_mechanisms": ["mechanism description"], "predicted_pathways": {"pathway_name": 0.8}, "confidence": 0.8, "claim_type": "causal", "claim": "optional free text"}]}, "justification": "...", "confidence": 0.8}',
    ]

    prompt_lines = [
        *preamble,
        *rule_bullets,
        "",
        "Action guidance:",
        *guidance_bullets,
        *response_format,
    ]
    return "\n".join(prompt_lines)
|
| |
|
| |
|
def build_agent_observation_context(
    obs: ExperimentObservation,
    *,
    max_tools: int = 6,
    max_assays: int = 3,
) -> str:
    """Summarize the modality, tools, and assays of *obs* as prompt text.

    Args:
        obs: Observation to summarize. Its own ``available_tools`` /
            ``available_assays`` are used when non-empty, otherwise the
            lists on ``obs.task``.
        max_tools: Number of tools described in full.
        max_assays: Number of assays described in full.

    Returns:
        Newline-joined lines: a modality sentence, then (when any exist)
        a tool section and an assay section. Names are de-duplicated with
        first-seen order kept. Past the full-description limit, up to 6
        further tool names and 4 further assay names are listed by name
        only; anything beyond that is not mentioned.
    """
    task = obs.task
    spec = MODALITY_REGISTRY.get(task.modality)
    if spec is None:
        # Unknown modality: fall back to echoing the raw modality string.
        headline = f"Modality context: {task.modality}."
    else:
        headline = (
            f"Modality context: {spec.name} measures {spec.measurement} "
            f"at {spec.resolution} resolution; "
            f"typical scale {spec.typical_cells}."
        )
    out: List[str] = [headline]

    # dict.fromkeys drops duplicates while keeping first-seen order.
    tools = list(dict.fromkeys(obs.available_tools or task.available_tools))
    if tools:
        out.append("Available tools (already filtered to this modality):")
        out.extend(
            f" - {describe_tool_for_agent(name)}" for name in tools[:max_tools]
        )
        if len(tools) > max_tools:
            overflow = tools[max_tools:max_tools + 6]
            out.append(
                " - Additional compatible tools not shown in full: "
                + ", ".join(overflow)
            )

    assays = list(dict.fromkeys(obs.available_assays or task.available_assays))
    if assays:
        out.append("Available assays:")
        out.extend(
            f" - {describe_assay_for_agent(name)}" for name in assays[:max_assays]
        )
        if len(assays) > max_assays:
            overflow = assays[max_assays:max_assays + 4]
            out.append(
                " - Additional compatible assays not shown in full: "
                + ", ".join(overflow)
            )

    return "\n".join(out)
|
| |
|