File size: 5,555 Bytes
4db0438
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5c3cfae
 
4db0438
 
5c3cfae
4db0438
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5c3cfae
4db0438
 
5c3cfae
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
"""Latent biological and technical state — hidden from the agent."""

from __future__ import annotations

from typing import Any, Dict, List, Optional

from pydantic import BaseModel, Field


class CellPopulation(BaseModel):
    """One ground-truth cell sub-population of the simulated tissue.

    ``name`` and ``proportion`` must be supplied; all remaining fields
    fall back to the defaults declared below.
    """

    # Required label for the population.
    name: str = Field(...)
    # Required; validated to lie within [0.0, 1.0].
    proportion: float = Field(..., ge=0.0, le=1.0)
    # Fresh empty list per instance unless provided.
    marker_genes: List[str] = Field(default_factory=list)
    # Free-form state label; starts out as "quiescent".
    state: str = Field(default="quiescent")
    # Presumably maps a condition name to a response magnitude -- confirm
    # against the code that consumes it.
    condition_response: Dict[str, float] = Field(default_factory=dict)


class GeneProgram(BaseModel):
    """A latent gene-regulatory programme.

    Only ``name`` is required. ``activity_level`` is validated to lie
    within [0.0, 1.0] and defaults to 0.5.
    """

    # Required identifier of the programme.
    name: str = Field(...)
    # Member genes; a new empty list is created for each instance.
    genes: List[str] = Field(default_factory=list)
    activity_level: float = Field(default=0.5, ge=0.0, le=1.0)
    # When True, ``conditions_active`` presumably lists the conditions in
    # which the programme is switched on -- confirm with the consumer.
    condition_dependent: bool = Field(default=False)
    conditions_active: List[str] = Field(default_factory=list)


class LatentBiologicalState(BaseModel):
    """Ground-truth biology that is hidden from the agent.

    Every field is optional: containers default to a fresh empty
    list/dict, ``true_trajectory`` defaults to ``None`` and
    ``n_true_cells`` defaults to 10,000.
    """

    # --- Populations and expression ground truth ---
    cell_populations: List[CellPopulation] = Field(default_factory=list)
    true_de_genes: Dict[str, Dict[str, float]] = Field(
        description="comparison_key → {gene: log2FC}", default_factory=dict
    )
    true_pathways: Dict[str, float] = Field(
        description="pathway → activity level", default_factory=dict
    )
    gene_programs: List[GeneProgram] = Field(default_factory=list)

    # --- Structure: trajectory and regulatory wiring ---
    # Untyped payload; its schema is not defined in this module.
    true_trajectory: Optional[Dict[str, Any]] = Field(default=None)
    true_regulatory_network: Dict[str, List[str]] = Field(
        description="TF → target genes", default_factory=dict
    )
    perturbation_effects: Dict[str, Dict[str, float]] = Field(
        description="perturbation → {gene: effect_size}", default_factory=dict
    )

    # --- Remaining ground-truth annotations ---
    confounders: Dict[str, float] = Field(default_factory=dict)
    true_markers: List[str] = Field(default_factory=list)
    causal_mechanisms: List[str] = Field(default_factory=list)
    n_true_cells: int = Field(default=10_000)


class TechnicalState(BaseModel):
    """Hidden technical parameters that shape experimental noise.

    All fraction/rate style parameters are validated to lie within
    [0.0, 1.0], matching the bounds already declared for
    ``sample_quality`` and ``library_complexity``. Out-of-range values are
    rejected by pydantic when the model is validated instead of silently
    flowing into downstream noise calculations.
    """

    # Per-batch effect magnitudes; a fresh empty dict per instance.
    batch_effects: Dict[str, float] = Field(default_factory=dict)
    ambient_rna_fraction: float = Field(0.05, ge=0.0, le=1.0)
    doublet_rate: float = Field(0.04, ge=0.0, le=1.0)
    dropout_rate: float = Field(0.1, ge=0.0, le=1.0)
    sample_quality: float = Field(0.9, ge=0.0, le=1.0)
    library_complexity: float = Field(0.8, ge=0.0, le=1.0)
    # NOTE(review): no range is declared for this factor; its valid range
    # is not visible from this module, so it is intentionally left
    # unconstrained.
    sequencing_depth_factor: float = 1.0
    capture_efficiency: float = Field(0.6, ge=0.0, le=1.0)


class ExperimentProgress(BaseModel):
    """Completion flags and result counts for the experiment stages.

    Every flag starts as ``False``; every count starts as ``None`` and
    stays that way until a value is assigned.
    """

    # --- Stage-completion flags (all default to False) ---
    samples_collected: bool = False
    cohort_selected: bool = False
    cells_cultured: bool = False
    library_prepared: bool = False
    perturbation_applied: bool = False
    cells_sequenced: bool = False
    qc_performed: bool = False
    data_filtered: bool = False
    data_normalized: bool = False
    batches_integrated: bool = False
    cells_clustered: bool = False
    de_performed: bool = False
    trajectories_inferred: bool = False
    pathways_analyzed: bool = False
    networks_inferred: bool = False
    markers_discovered: bool = False
    markers_validated: bool = False
    followup_designed: bool = False
    subagent_review_requested: bool = False
    conclusion_reached: bool = False

    # --- Result counts (None means "no value recorded") ---
    n_cells_sequenced: Optional[int] = Field(default=None)
    n_cells_after_filter: Optional[int] = Field(default=None)
    n_clusters_found: Optional[int] = Field(default=None)
    n_de_genes_found: Optional[int] = Field(default=None)
    n_markers_found: Optional[int] = Field(default=None)


class ResourceState(BaseModel):
    """Internal resource ledger (superset of agent-visible ResourceUsage).

    Stores budget, time, sample and consumable counters, and derives the
    remaining budget/time plus their exhaustion flags as read-only
    properties.
    """

    # Budget ledger.
    budget_total: float = 100_000.0
    budget_used: float = 0.0

    # Time ledger, in days.
    time_limit_days: float = 180.0
    time_used_days: float = 0.0

    # Sample and consumable counters.
    samples_available: int = 0
    samples_consumed: int = 0
    compute_hours_used: float = 0.0
    sequencing_lanes_used: int = 0
    reagent_kits_used: int = 0

    @property
    def budget_remaining(self) -> float:
        """Return the unspent budget, floored at 0.0 when overspent."""
        unspent = self.budget_total - self.budget_used
        return unspent if unspent > 0.0 else 0.0

    @property
    def time_remaining_days(self) -> float:
        """Return the days left before the limit, floored at 0.0."""
        days_left = self.time_limit_days - self.time_used_days
        return days_left if days_left > 0.0 else 0.0

    @property
    def budget_exhausted(self) -> bool:
        """Return True once no budget remains."""
        return not self.budget_remaining > 0

    @property
    def time_exhausted(self) -> bool:
        """Return True once no time remains."""
        return not self.time_remaining_days > 0


class FullLatentState(BaseModel):
    """Aggregate of every hidden piece of the simulated biological world.

    Bundles the biology, technical, progress and resource sub-states with
    bookkeeping fields. Each sub-state defaults to a freshly constructed
    instance of its model.
    """

    # --- Nested sub-states ---
    biology: LatentBiologicalState = Field(default_factory=LatentBiologicalState)
    technical: TechnicalState = Field(default_factory=TechnicalState)
    progress: ExperimentProgress = Field(default_factory=ExperimentProgress)
    resources: ResourceState = Field(default_factory=ResourceState)

    # --- Bookkeeping ---
    hidden_failure_conditions: List[str] = Field(default_factory=list)
    mechanism_confidence: Dict[str, float] = Field(default_factory=dict)
    discovered_de_genes: List[str] = Field(default_factory=list)
    discovered_clusters: List[str] = Field(default_factory=list)
    task_modality: str = Field(default="scRNA-seq")
    step_count: int = Field(default=0)
    rng_seed: int = Field(default=42)

    # --- Transient hand-off values ---
    # Sampled by the transition engine and read by the output generator
    # within the same step. ``exclude=True`` keeps them out of serialized
    # output.
    last_retain_frac: Optional[float] = Field(default=None, exclude=True)
    last_n_clusters: Optional[int] = Field(default=None, exclude=True)
    last_perturbation_efficiency: Optional[float] = Field(
        default=None, exclude=True
    )