File size: 5,030 Bytes
df98fca
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
"""Latent biological and technical state — hidden from the agent."""

from __future__ import annotations

from typing import Any, Dict, List, Optional

from pydantic import BaseModel, Field


class CellPopulation(BaseModel):
    """One ground-truth cell sub-population of the simulated tissue.

    ``name`` and ``proportion`` must be supplied; every other field falls
    back to a default.
    """

    name: str
    # Required, and validated to lie within the closed interval [0.0, 1.0].
    proportion: float = Field(..., ge=0.0, le=1.0)
    marker_genes: List[str] = Field(default_factory=list)
    state: str = Field(default="quiescent")
    # String key -> float. Presumably condition name -> response strength;
    # TODO confirm against the code that populates this mapping.
    condition_response: Dict[str, float] = Field(default_factory=dict)


class GeneProgram(BaseModel):
    """Latent gene-regulatory programme.

    Only ``name`` is required; the remaining fields have defaults.
    """

    name: str
    genes: List[str] = Field(default_factory=list)
    # Validated to lie within [0.0, 1.0]; defaults to the midpoint.
    activity_level: float = Field(default=0.5, ge=0.0, le=1.0)
    condition_dependent: bool = Field(default=False)
    # NOTE(review): presumably only meaningful when ``condition_dependent``
    # is True -- nothing in this model enforces that link.
    conditions_active: List[str] = Field(default_factory=list)


class LatentBiologicalState(BaseModel):
    """Ground-truth biology that is hidden from the agent.

    Every field has a default, so the model can be built with no arguments
    and filled in afterwards.
    """

    cell_populations: List[CellPopulation] = Field(default_factory=list)
    true_de_genes: Dict[str, Dict[str, float]] = Field(
        description="comparison_key → {gene: log2FC}",
        default_factory=dict,
    )
    true_pathways: Dict[str, float] = Field(
        description="pathway → activity level",
        default_factory=dict,
    )
    gene_programs: List[GeneProgram] = Field(default_factory=list)
    # Free-form mapping; its schema is not defined in this module.
    true_trajectory: Optional[Dict[str, Any]] = Field(default=None)
    true_regulatory_network: Dict[str, List[str]] = Field(
        description="TF → target genes",
        default_factory=dict,
    )
    perturbation_effects: Dict[str, Dict[str, float]] = Field(
        description="perturbation → {gene: effect_size}",
        default_factory=dict,
    )
    # String key -> float; presumably confounder name -> strength
    # (TODO confirm against the code that reads it).
    confounders: Dict[str, float] = Field(default_factory=dict)
    true_markers: List[str] = Field(default_factory=list)
    causal_mechanisms: List[str] = Field(default_factory=list)
    n_true_cells: int = Field(default=10_000)


class TechnicalState(BaseModel):
    """Hidden technical parameters that shape experimental noise.

    All fields have defaults. Every fraction-like field is validated to lie
    in the closed interval [0.0, 1.0] when the model is constructed, matching
    the bounds already applied to ``sample_quality`` and
    ``library_complexity``. An out-of-range value raises pydantic's
    ``ValidationError`` instead of being stored silently. Plain attribute
    assignment after construction is not re-validated unless the model
    config enables it.
    """

    # String key -> float; presumably batch label -> effect magnitude
    # (TODO confirm against the noise model that consumes it).
    batch_effects: Dict[str, float] = Field(default_factory=dict)
    ambient_rna_fraction: float = Field(0.05, ge=0.0, le=1.0)
    doublet_rate: float = Field(0.04, ge=0.0, le=1.0)
    dropout_rate: float = Field(0.1, ge=0.0, le=1.0)
    sample_quality: float = Field(0.9, ge=0.0, le=1.0)
    library_complexity: float = Field(0.8, ge=0.0, le=1.0)
    # Deliberately left unbounded: the valid range of this multiplier is not
    # established anywhere in this module.
    sequencing_depth_factor: float = 1.0
    capture_efficiency: float = Field(0.6, ge=0.0, le=1.0)


class ExperimentProgress(BaseModel):
    """Record of which experiment stages have been completed so far.

    Holds one boolean per stage, all initially ``False``, followed by a few
    optional integer counts that are ``None`` until a value is assigned.
    """

    # --- Stage-completion flags (declaration order is preserved) ---
    samples_collected: bool = False
    cohort_selected: bool = False
    cells_cultured: bool = False
    library_prepared: bool = False
    perturbation_applied: bool = False
    cells_sequenced: bool = False
    qc_performed: bool = False
    data_filtered: bool = False
    data_normalized: bool = False
    batches_integrated: bool = False
    cells_clustered: bool = False
    de_performed: bool = False
    trajectories_inferred: bool = False
    pathways_analyzed: bool = False
    networks_inferred: bool = False
    markers_discovered: bool = False
    markers_validated: bool = False
    conclusion_reached: bool = False

    # --- Optional result counts; None means "not recorded" ---
    n_cells_after_filter: Optional[int] = Field(default=None)
    n_clusters_found: Optional[int] = Field(default=None)
    n_de_genes_found: Optional[int] = Field(default=None)
    n_markers_found: Optional[int] = Field(default=None)


class ResourceState(BaseModel):
    """Internal ledger of budget, time, samples and consumables.

    The original author describes this as a superset of the agent-visible
    ``ResourceUsage``; that type is not defined in this module. The derived
    values below are read-only properties, not model fields.
    """

    # Budget, in whatever currency unit the caller uses.
    budget_total: float = Field(default=100_000.0)
    budget_used: float = Field(default=0.0)
    # Wall-clock allowance, measured in days.
    time_limit_days: float = Field(default=180.0)
    time_used_days: float = Field(default=0.0)
    # Sample and consumable counters.
    samples_available: int = Field(default=0)
    samples_consumed: int = Field(default=0)
    compute_hours_used: float = Field(default=0.0)
    sequencing_lanes_used: int = Field(default=0)
    reagent_kits_used: int = Field(default=0)

    @property
    def budget_remaining(self) -> float:
        """Unspent budget, clamped to zero once spending reaches the total."""
        unspent = self.budget_total - self.budget_used
        return unspent if unspent > 0.0 else 0.0

    @property
    def time_remaining_days(self) -> float:
        """Days left before the time limit, clamped to zero once overrun."""
        days_left = self.time_limit_days - self.time_used_days
        return days_left if days_left > 0.0 else 0.0

    @property
    def budget_exhausted(self) -> bool:
        """True when no budget remains."""
        return not self.budget_remaining > 0

    @property
    def time_exhausted(self) -> bool:
        """True when no time remains."""
        return not self.time_remaining_days > 0


class FullLatentState(BaseModel):
    """Aggregate hidden state of the simulated biological world.

    Bundles the biological, technical, progress and resource sub-states with
    a few bookkeeping fields. Each sub-state is built by its own default
    factory, so ``FullLatentState()`` works with no arguments.
    """

    # --- Nested sub-states, each created fresh per instance ---
    biology: LatentBiologicalState = Field(
        default_factory=LatentBiologicalState,
    )
    technical: TechnicalState = Field(default_factory=TechnicalState)
    progress: ExperimentProgress = Field(default_factory=ExperimentProgress)
    resources: ResourceState = Field(default_factory=ResourceState)

    # --- Bookkeeping ---
    hidden_failure_conditions: List[str] = Field(default_factory=list)
    # String key -> float; presumably mechanism name -> confidence
    # (TODO confirm against the code that updates it).
    mechanism_confidence: Dict[str, float] = Field(default_factory=dict)
    discovered_de_genes: List[str] = Field(default_factory=list)
    discovered_clusters: List[str] = Field(default_factory=list)
    step_count: int = Field(default=0)
    rng_seed: int = Field(default=42)