# odse/core/models.py
# Hugging Face Hub page residue: "simeetnayan's picture"
# Commit message: "Upload folder using huggingface_hub"
# Commit: 4e680fd (verified)
"""Pydantic models for the ODSE Sandbox Environment.

Defines the two actions (RunCode, Submit), observations, step results,
and supporting types for a code-execution sandbox where agents write
and execute Python code to solve data-science tasks.

Architecture
------------
Instead of a fixed DSL with enumerated action types, the sandbox
exposes only two actions:

* ``RunCodeAction`` : execute arbitrary Python in a persistent namespace.
* ``SubmitAction`` : submit predictions and terminate the episode.

The observation gives the agent execution feedback (stdout/stderr),
workspace state (variables, shapes), scoring context, and dataset
metadata so it can plan its next code cell.
"""
from __future__ import annotations
from enum import Enum
from typing import Any, Dict, List, Literal, Optional, Tuple, Union
from pydantic import BaseModel, Field
from typing_extensions import Annotated
# ============================================================================
# Enums
# ============================================================================
class ProblemType(str, Enum):
    """Kind of ML problem the agent is asked to solve.

    Members are also ``str`` instances, so they compare equal to their
    plain string values (e.g. ``ProblemType.REGRESSION == "regression"``).
    """

    CLASSIFICATION = "classification"
    REGRESSION = "regression"
class Difficulty(str, Enum):
    """Difficulty level of a task.

    Controls dataset noise, nulls, and step budget. Members are also
    ``str`` instances and compare equal to their plain string values.
    """

    EASY = "easy"
    MEDIUM = "medium"
    HARD = "hard"
class ExecutionStatus(str, Enum):
    """Outcome of one code execution in the sandbox.

    Members are also ``str`` instances and compare equal to their plain
    string values.
    """

    SUCCESS = "success"
    ERROR = "error"
    TIMEOUT = "timeout"
# ============================================================================
# Actions
# ============================================================================
class RunCodeAction(BaseModel):
    """Action that executes a cell of Python code in the sandbox.

    Execution is notebook-style: the namespace is persistent, so variables
    created by one ``RunCode`` call remain available to later calls.

    The namespace comes pre-loaded with:

    * ``train_df`` : training DataFrame (features **+** target)
    * ``val_features`` : validation features (target hidden)
    * ``test_features`` : test features (target hidden)
    * ``target_column`` : name of the target column (str)
    * ``pd``, ``np`` : pandas and numpy
    * ``evaluate(preds)`` : scores predictions against the hidden
      **validation** labels

    Before issuing a ``SubmitAction``, assign the **test-set** predictions
    to the variable ``predictions``.
    """

    # Tag used by the ``Action`` union to tell the two action models apart;
    # declared frozen on the field.
    action_type: Literal["run_code"] = Field("run_code", frozen=True)

    # Source of the cell to run (required).
    code: str = Field(description="Python code to execute in the sandbox")
class SubmitAction(BaseModel):
    """Action that submits predictions and ends the episode.

    The sandbox reads the ``predictions`` variable from its namespace and
    scores it against the **hidden test labels**. ``predictions`` must be
    array-like, with the same length as ``test_features``.
    """

    # Tag used by the ``Action`` union to tell the two action models apart;
    # declared frozen on the field. This action carries no other payload.
    action_type: Literal["submit"] = Field("submit", frozen=True)
# Tagged union of every action the sandbox accepts. The ``action_type``
# literal on each member ("run_code" / "submit") is the discriminator used
# to pick the concrete model during validation.
Action = Annotated[
    Union[RunCodeAction, SubmitAction], Field(discriminator="action_type")
]
# ============================================================================
# Dataset / Column metadata
# ============================================================================
class ColumnSchema(BaseModel):
    """Per-column schema information for one column of the dataset."""

    name: str
    dtype: str

    # Counts are validated to be non-negative.
    null_count: int = Field(ge=0)
    is_numeric: bool
    unique_count: int = Field(ge=0)

    # Example values from the column; defaults to an empty list and is
    # constrained with ``max_length=5``.
    sample_values: List[Any] = Field(max_length=5, default_factory=list)
class DatasetInfo(BaseModel):
    """Dataset metadata carried by every observation given to the agent."""

    # Shapes of the three splits, each a pair of ints.
    train_shape: Tuple[int, int]
    val_shape: Tuple[int, int]
    test_shape: Tuple[int, int]

    target_column: str
    problem_description: str = ""

    # "classification" or "regression"
    problem_type: str

    # Primary metric name (e.g. "accuracy", "r2")
    metric: str

    columns: List[ColumnSchema]

    # Classification only
    target_classes: Optional[List[Any]] = None

    # Regression only
    target_stats: Optional[Dict[str, float]] = None
# ============================================================================
# Namespace summary
# ============================================================================
class VariableInfo(BaseModel):
    """Summary of a single variable living in the agent's sandbox namespace."""

    name: str
    type_name: str

    # Optional shape tuple; ``None`` when not provided.
    shape: Optional[Tuple[int, ...]] = None

    # Short textual preview of the value; empty by default and limited to
    # 500 characters by ``max_length``.
    preview: str = Field("", max_length=500)
# ============================================================================
# Observation
# ============================================================================
class Observation(BaseModel):
    """What the agent sees after each ``reset()`` or ``step()`` call.

    Groups four kinds of information: the result of the last code
    execution, the current workspace state, scoring context, and
    episode-level context (step budget, dataset metadata, task text).
    """

    # --- Execution result (empty on reset) --------------------------------
    stdout: str = Field("", description="Captured stdout from last code execution")
    stderr: str = Field(
        "", description="Captured stderr / traceback from last execution"
    )
    execution_status: Optional[ExecutionStatus] = Field(
        None, description="Status of the last code execution (None on reset)"
    )
    execution_time_ms: float = Field(
        0.0,
        description="Wall-clock time of last execution in milliseconds",
        ge=0.0,
    )

    # --- Workspace state --------------------------------------------------
    namespace_summary: List[VariableInfo] = Field(
        description="User-visible variables in the sandbox namespace",
        default_factory=list,
    )

    # --- Scoring ----------------------------------------------------------
    validation_score: Optional[float] = Field(
        None,
        description="Latest validation score (from evaluate() or auto-detected)",
    )
    best_validation_score: Optional[float] = Field(
        None, description="Best validation score achieved this episode"
    )
    test_score: Optional[float] = Field(
        None, description="Test-set score (populated only after submit)"
    )
    test_report: Optional[Dict[str, Any]] = Field(
        None,
        description="Full test evaluation report (populated only after submit)",
    )

    # --- Episode context --------------------------------------------------
    # These three plus ``task_description`` have no default and are required.
    step_count: int = Field(description="Steps taken so far", ge=0)
    max_steps: int = Field(description="Step budget for this episode", ge=1)
    dataset_info: DatasetInfo
    task_description: str = Field(
        description="Human-readable description of the agent's objective"
    )
    done: bool = Field(False, description="Whether the episode has ended")
# ============================================================================
# Step result
# ============================================================================
class StepResult(BaseModel):
    """Bundle returned by one ``env.step()`` call.

    Holds the new observation, the scalar reward for the step, the
    termination flag, and a free-form diagnostics dictionary.
    """

    observation: Observation
    reward: float = Field(description="Scalar reward for this step")
    done: bool = Field(description="Whether the episode has terminated")

    # Optional extras; a fresh empty dict is created per instance.
    info: Dict[str, Any] = Field(
        description="Additional diagnostics (scores, timing, breakdown, ...)",
        default_factory=dict,
    )