Spaces:
Sleeping
Sleeping
"""Pydantic models for the ODSE Sandbox Environment.

Defines the two actions (RunCode, Submit), observations, step results,
and supporting types for a code-execution sandbox where agents write
and execute Python code to solve data-science tasks.

Architecture
------------
Instead of a fixed DSL with enumerated action types, the sandbox
exposes only two actions:

* ``RunCodeAction`` : execute arbitrary Python in a persistent namespace.
* ``SubmitAction``  : submit predictions and terminate the episode.

The observation gives the agent execution feedback (stdout/stderr),
workspace state (variables, shapes), scoring context, and dataset
metadata so it can plan its next code cell.
"""
| from __future__ import annotations | |
| from enum import Enum | |
| from typing import Any, Dict, List, Literal, Optional, Tuple, Union | |
| from pydantic import BaseModel, Field | |
| from typing_extensions import Annotated | |
# ============================================================================
# Enums
# ============================================================================
class ProblemType(str, Enum):
    """Type of ML problem the agent must solve.

    The ``str`` mix-in makes every member a string instance that compares
    equal to its plain value (``ProblemType.REGRESSION == "regression"``).
    """

    CLASSIFICATION = "classification"
    REGRESSION = "regression"
class Difficulty(str, Enum):
    """Difficulty level - controls dataset noise, nulls, and step budget.

    NOTE(review): how each level maps to noise, null rate and step budget is
    decided outside this module; only the three labels are defined here.
    The ``str`` mix-in makes every member compare equal to its plain value.
    """

    EASY = "easy"
    MEDIUM = "medium"
    HARD = "hard"
class ExecutionStatus(str, Enum):
    """Outcome of a single code execution.

    The ``str`` mix-in makes every member compare equal to its plain value
    (``ExecutionStatus.TIMEOUT == "timeout"``).
    """

    SUCCESS = "success"
    ERROR = "error"
    TIMEOUT = "timeout"
# ============================================================================
# Actions
# ============================================================================
class RunCodeAction(BaseModel):
    """Execute Python code in the sandbox.

    The code runs in a persistent namespace pre-loaded with:

    * ``train_df``        : Training DataFrame (features **+** target)
    * ``val_features``    : Validation features (target hidden)
    * ``test_features``   : Test features (target hidden)
    * ``target_column``   : Name of the target column (str)
    * ``pd``, ``np``      : pandas and numpy
    * ``evaluate(preds)`` : Score predictions against hidden **validation** labels

    Variables persist across ``RunCode`` calls (notebook-style kernel).
    Assign your **test-set** predictions to the variable ``predictions``
    before calling ``SubmitAction``.
    """

    # Tag that the ``Action`` union discriminates on; frozen, so it cannot
    # be reassigned after construction.
    action_type: Literal["run_code"] = Field(default="run_code", frozen=True)
    # Required: there is no default, so every RunCodeAction must carry code.
    code: str = Field(description="Python code to execute in the sandbox")
class SubmitAction(BaseModel):
    """Submit predictions and terminate the episode.

    Reads the ``predictions`` variable from the sandbox namespace and
    scores it against the **hidden test labels**. The variable must be
    an array-like whose length matches ``test_features``.

    The action itself carries no payload beyond its type tag.
    """

    # Tag that the ``Action`` union discriminates on; frozen, so it cannot
    # be reassigned after construction.
    action_type: Literal["submit"] = Field(default="submit", frozen=True)
# Discriminated union of every action the sandbox accepts. Pydantic picks the
# concrete model by looking at the ``action_type`` field of the input.
Action = Annotated[
    Union[RunCodeAction, SubmitAction],
    Field(discriminator="action_type"),
]
# ============================================================================
# Dataset / Column metadata
# ============================================================================
class ColumnSchema(BaseModel):
    """Schema information for a single column in the dataset."""

    name: str
    # Dtype rendered as text; the exact format is decided by whoever builds
    # the schema (presumably the pandas dtype name - TODO confirm).
    dtype: str
    # Validated to be non-negative.
    null_count: int = Field(ge=0)
    is_numeric: bool
    # Validated to be non-negative.
    unique_count: int = Field(ge=0)
    # Example values; validation caps the list at 5 entries, empty by default.
    sample_values: List[Any] = Field(default_factory=list, max_length=5)
class DatasetInfo(BaseModel):
    """Metadata about the dataset, provided to the agent on every observation."""

    # Two-integer shapes of each split (presumably (n_rows, n_cols) - TODO confirm).
    train_shape: Tuple[int, int]
    val_shape: Tuple[int, int]
    test_shape: Tuple[int, int]
    target_column: str
    problem_description: str = ""
    # NOTE(review): typed as a plain ``str`` even though a ``ProblemType``
    # enum exists in this module, so the value is not validated here.
    problem_type: str  # "classification" or "regression"
    metric: str  # primary metric name (e.g. "accuracy", "r2")
    columns: List[ColumnSchema]
    target_classes: Optional[List[Any]] = None  # classification only
    target_stats: Optional[Dict[str, float]] = None  # regression only
# ============================================================================
# Namespace summary
# ============================================================================
class VariableInfo(BaseModel):
    """Summary of one variable in the agent's sandbox namespace."""

    name: str
    type_name: str
    # Dimensions of the value when it has a shape; ``None`` otherwise.
    shape: Optional[Tuple[int, ...]] = None
    # Short textual preview; validation rejects strings over 500 characters.
    preview: str = Field(default="", max_length=500)
# ============================================================================
# Observation
# ============================================================================
class Observation(BaseModel):
    """Observation returned after every ``reset()`` or ``step()``.

    Groups four kinds of information: the result of the last code
    execution, a summary of the sandbox namespace, scoring state, and
    episode context. Only ``step_count``, ``max_steps``, ``dataset_info``
    and ``task_description`` are required; every other field has a default.
    """

    # -- Execution result (empty on reset) -----------------------------------
    stdout: str = Field(
        default="",
        description="Captured stdout from last code execution",
    )
    stderr: str = Field(
        default="",
        description="Captured stderr / traceback from last execution",
    )
    execution_status: Optional[ExecutionStatus] = Field(
        default=None,
        description="Status of the last code execution (None on reset)",
    )
    execution_time_ms: float = Field(
        default=0.0,
        ge=0.0,
        description="Wall-clock time of last execution in milliseconds",
    )

    # -- Workspace state -----------------------------------------------------
    namespace_summary: List[VariableInfo] = Field(
        default_factory=list,
        description="User-visible variables in the sandbox namespace",
    )

    # -- Scoring -------------------------------------------------------------
    validation_score: Optional[float] = Field(
        default=None,
        description="Latest validation score (from evaluate() or auto-detected)",
    )
    best_validation_score: Optional[float] = Field(
        default=None,
        description="Best validation score achieved this episode",
    )
    test_score: Optional[float] = Field(
        default=None,
        description="Test-set score (populated only after submit)",
    )
    test_report: Optional[Dict[str, Any]] = Field(
        default=None,
        description="Full test evaluation report (populated only after submit)",
    )

    # -- Episode context -----------------------------------------------------
    # NOTE(review): nothing here enforces step_count <= max_steps; the two
    # fields are validated independently.
    step_count: int = Field(ge=0, description="Steps taken so far")
    max_steps: int = Field(ge=1, description="Step budget for this episode")
    dataset_info: DatasetInfo
    task_description: str = Field(
        description="Human-readable description of the agent's objective",
    )
    done: bool = Field(default=False, description="Whether the episode has ended")
# ============================================================================
# Step result
# ============================================================================
class StepResult(BaseModel):
    """Result of a single ``env.step()`` call.

    Bundles the new observation with the reward, the termination flag and
    a free-form diagnostics dictionary.
    """

    observation: Observation
    reward: float = Field(description="Scalar reward for this step")
    # NOTE(review): ``Observation`` also has a ``done`` field; nothing in
    # this model keeps the two flags in sync.
    done: bool = Field(description="Whether the episode has terminated")
    info: Dict[str, Any] = Field(
        default_factory=dict,
        description="Additional diagnostics (scores, timing, breakdown, ...)",
    )