""" OpenEnv-compatible data models for the ODSE (Open Data Science Environment). Three types are defined: * OdseAction - action with action_type and optional code. * OdseObservation - extends OpenEnv Observation with all ODSE fields. The core Observation is kept free of OpenEnv dependencies; this class bridges the two. * OdseState - extends OpenEnv State with data-science episode metadata. """ from typing import Any, Dict, List, Literal, Optional from openenv.core.env_server.types import Action, Observation as OpenEnvObservation, State from pydantic import Field from odse.core.models import DatasetInfo, ExecutionStatus, VariableInfo # ============================================================================== # Action # ============================================================================== class OdseAction(Action): """Action for the ODSE environment. Two action types are supported: * run_code - execute Python code in the persistent sandbox. * submit - submit predictions and end the episode. """ action_type: Literal["run_code", "submit"] = Field( ..., description="Type of action: 'run_code' or 'submit'" ) code: Optional[str] = Field( default=None, description="Python code to execute (required when action_type='run_code')", ) # ============================================================================== # Observation # ============================================================================== class OdseObservation(OpenEnvObservation): """OpenEnv-compatible observation for the ODSE environment. Extends OpenEnv Observation (done, reward, metadata) with all ODSE-specific fields. The core Observation has no OpenEnv dependency; the server wrapper constructs this class from core output. """ # -- Execution result (empty on reset) ------------------------------------- stdout: str = Field( default="", description="Captured stdout from last code execution", ) stderr: str = Field( default="", description="Captured stderr / traceback from last execution", ) execution_status: Optional[ExecutionStatus] = Field( default=None, description="Status of the last code execution (None on reset)", ) execution_time_ms: float = Field( default=0.0, ge=0.0, description="Wall-clock time of last execution in milliseconds", ) # -- Workspace state ------------------------------------------------------- namespace_summary: List[VariableInfo] = Field( default_factory=list, description="User-visible variables in the sandbox namespace", ) # -- Scoring --------------------------------------------------------------- validation_score: Optional[float] = Field( default=None, description="Latest validation score (from evaluate or auto-detected)", ) best_validation_score: Optional[float] = Field( default=None, description="Best validation score achieved this episode", ) test_score: Optional[float] = Field( default=None, description="Test-set score (populated only after submit)", ) test_report: Optional[Dict[str, Any]] = Field( default=None, description="Full test evaluation report (populated only after submit)", ) # -- Episode context ------------------------------------------------------- step_count: int = Field( default=0, ge=0, description="Steps taken so far", ) max_steps: int = Field( default=20, ge=1, description="Step budget for this episode", ) dataset_info: Optional[DatasetInfo] = Field( default=None, description="Metadata about the dataset", ) task_description: str = Field( default="", description="Human-readable description of the agent objective", ) # -- Step diagnostics (named field - survives serialize_observation) ------- info: Dict[str, Any] = Field( default_factory=dict, description="Step diagnostics (scores, timing, breakdown)", ) # ============================================================================== # State # ============================================================================== class OdseState(State): """Episode state for the ODSE environment.""" dataset_name: str = Field( default="", description="Name of the dataset (e.g. 'breast_cancer', 'iris')" ) difficulty: str = Field( default="easy", description="Difficulty level: 'easy', 'medium', or 'hard'" ) problem_type: str = Field( default="", description="'classification' or 'regression'" ) target_column: str = Field( default="", description="Name of the target column in the dataset" ) problem_description: str = Field( default="", description="Human-readable description of the dataset objective" ) metric: str = Field( default="", description="Primary evaluation metric (e.g. 'accuracy', 'r2')" ) max_steps: int = Field( default=20, ge=1, description="Maximum code-execution steps for this episode" ) done: bool = Field( default=False, description="Whether the episode has ended" ) best_validation_score: Optional[float] = Field( default=None, description="Best validation score achieved so far this episode", ) latest_validation_score: Optional[float] = Field( default=None, description="Most recent validation score", )