Spaces:

simeetnayan
/

odse

Sleeping

App Files Files Community

odse / core /models.py

simeetnayan

Upload folder using huggingface_hub

4e680fd verified about 1 month ago

raw

history blame contribute delete

7.34 kB

	"""Pydantic models for the ODSE Sandbox Environment.

	Defines the two actions (RunCode, Submit), observations, step results,
	and supporting types for a code-execution sandbox where agents write
	and execute Python code to solve data-science tasks.

	Architecture
	------------
	Instead of a fixed DSL with enumerated action types, the sandbox
	exposes only two actions:

	* ``RunCodeAction`` : execute arbitrary Python in a persistent namespace.
	* ``SubmitAction`` : submit predictions and terminate the episode.

	The observation gives the agent execution feedback (stdout/stderr),
	workspace state (variables, shapes), scoring context, and dataset
	metadata so it can plan its next code cell.
	"""

	from __future__ import annotations

	from enum import Enum
	from typing import Any, Dict, List, Literal, Optional, Tuple, Union

	from pydantic import BaseModel, Field
	from typing_extensions import Annotated


	# ============================================================================
	# Enums
	# ============================================================================

	class ProblemType(str, Enum):
	    """Type of ML problem the agent must solve.

	    Members subclass ``str``, so each one compares equal to its plain
	    string value (e.g. ``ProblemType.REGRESSION == "regression"``).
	    """

	    CLASSIFICATION = "classification"
	    REGRESSION = "regression"


	class Difficulty(str, Enum):
	    """Difficulty level - controls dataset noise, nulls, and step budget.

	    Members subclass ``str``, so each one compares equal to its plain
	    string value (e.g. ``Difficulty.EASY == "easy"``).
	    """

	    EASY = "easy"
	    MEDIUM = "medium"
	    HARD = "hard"


	class ExecutionStatus(str, Enum):
	    """Outcome of a single code execution.

	    Members subclass ``str``, so each one compares equal to its plain
	    string value (e.g. ``ExecutionStatus.ERROR == "error"``).
	    """

	    SUCCESS = "success"
	    ERROR = "error"
	    TIMEOUT = "timeout"


	# ============================================================================
	# Actions
	# ============================================================================

	class RunCodeAction(BaseModel):
	    """Execute Python code in the sandbox.

	    The code runs in a persistent namespace pre-loaded with:

	    * ``train_df`` : Training DataFrame (features + target)
	    * ``val_features`` : Validation features (target hidden)
	    * ``test_features`` : Test features (target hidden)
	    * ``target_column`` : Name of the target column (str)
	    * ``pd``, ``np`` : pandas and numpy
	    * ``evaluate(preds)`` : Score predictions against hidden validation labels

	    Variables persist across ``RunCode`` calls (notebook-style kernel).
	    Assign your test-set predictions to the variable ``predictions``
	    before calling ``SubmitAction``.
	    """

	    # Constant tag identifying this action; it is the discriminator key
	    # used by the ``Action`` union, so it defaults to (and only accepts)
	    # "run_code" and is declared frozen.
	    action_type: Literal["run_code"] = Field(default="run_code", frozen=True)
	    # Required: the source text of the code cell to run.
	    code: str = Field(description="Python code to execute in the sandbox")


	class SubmitAction(BaseModel):
	    """Submit predictions and terminate the episode.

	    Reads the ``predictions`` variable from the sandbox namespace and
	    scores it against the hidden test labels. The variable must be
	    an array-like whose length matches ``test_features``.

	    The action carries no payload of its own; only the tag below.
	    """

	    # Constant tag identifying this action; it is the discriminator key
	    # used by the ``Action`` union, so it defaults to (and only accepts)
	    # "submit" and is declared frozen.
	    action_type: Literal["submit"] = Field(default="submit", frozen=True)


	# Tagged union of every action an agent may send. The ``action_type``
	# literal on each member is the discriminator used to pick the model.
	Action = Annotated[
	    Union[RunCodeAction, SubmitAction], Field(discriminator="action_type")
	]


	# ============================================================================
	# Dataset / Column metadata
	# ============================================================================

	class ColumnSchema(BaseModel):
	    """Schema information for a single column in the dataset."""

	    # Column label and its dtype rendered as a string.
	    name: str
	    dtype: str
	    # Number of null entries in the column; must be non-negative.
	    null_count: int = Field(ge=0)
	    is_numeric: bool
	    # Number of distinct values in the column; must be non-negative.
	    unique_count: int = Field(ge=0)
	    # A few example values; validation rejects lists longer than 5 items.
	    sample_values: List[Any] = Field(default_factory=list, max_length=5)


	class DatasetInfo(BaseModel):
	    """Metadata about the dataset, provided to the agent on every observation."""

	    # Two-integer shapes of the train / validation / test splits.
	    train_shape: Tuple[int, int]
	    val_shape: Tuple[int, int]
	    test_shape: Tuple[int, int]
	    target_column: str
	    problem_description: str = ""
	    # "classification" or "regression" (the same strings as the
	    # ``ProblemType`` enum values); kept as a plain ``str`` here.
	    problem_type: str
	    # Primary metric name (e.g. "accuracy", "r2").
	    metric: str
	    columns: List[ColumnSchema]
	    # Classification only; ``None`` otherwise.
	    target_classes: Optional[List[Any]] = None
	    # Regression only; ``None`` otherwise.
	    target_stats: Optional[Dict[str, float]] = None


	# ============================================================================
	# Namespace summary
	# ============================================================================

	class VariableInfo(BaseModel):
	    """Summary of one variable in the agent's sandbox namespace."""

	    # Variable name and the name of its type.
	    name: str
	    type_name: str
	    # Shape as a tuple of ints of any length; ``None`` when not provided.
	    shape: Optional[Tuple[int, ...]] = None
	    # Short textual preview; validation rejects strings over 500 characters,
	    # so callers must truncate before constructing the model.
	    preview: str = Field(default="", max_length=500)


	# ============================================================================
	# Observation
	# ============================================================================

	class Observation(BaseModel):
	    """Observation returned after every ``reset()`` or ``step()``.

	    Groups four kinds of information: the result of the last code
	    execution, a summary of the sandbox namespace, scoring state, and
	    episode context. ``step_count``, ``max_steps``, ``dataset_info`` and
	    ``task_description`` are required; every other field has a default.
	    """

	    # - Execution result (empty on reset) ------------------------------------
	    stdout: str = Field(
	        default="",
	        description="Captured stdout from last code execution",
	    )
	    stderr: str = Field(
	        default="",
	        description="Captured stderr / traceback from last execution",
	    )
	    execution_status: Optional[ExecutionStatus] = Field(
	        default=None,
	        description="Status of the last code execution (None on reset)",
	    )
	    execution_time_ms: float = Field(
	        default=0.0,
	        ge=0.0,
	        description="Wall-clock time of last execution in milliseconds",
	    )

	    # - Workspace state ------------------------------------------------------
	    namespace_summary: List[VariableInfo] = Field(
	        default_factory=list,
	        description="User-visible variables in the sandbox namespace",
	    )

	    # - Scoring --------------------------------------------------------------
	    validation_score: Optional[float] = Field(
	        default=None,
	        description="Latest validation score (from evaluate() or auto-detected)",
	    )
	    best_validation_score: Optional[float] = Field(
	        default=None,
	        description="Best validation score achieved this episode",
	    )
	    # The two test_* fields stay None until the agent submits.
	    test_score: Optional[float] = Field(
	        default=None,
	        description="Test-set score (populated only after submit)",
	    )
	    test_report: Optional[Dict[str, Any]] = Field(
	        default=None,
	        description="Full test evaluation report (populated only after submit)",
	    )

	    # - Episode context ------------------------------------------------------
	    step_count: int = Field(ge=0, description="Steps taken so far")
	    max_steps: int = Field(ge=1, description="Step budget for this episode")
	    dataset_info: DatasetInfo
	    task_description: str = Field(
	        description="Human-readable description of the agent's objective",
	    )
	    done: bool = Field(default=False, description="Whether the episode has ended")


	# ============================================================================
	# Step result
	# ============================================================================

	class StepResult(BaseModel):
	    """Result of a single ``env.step()`` call.

	    ``observation``, ``reward`` and ``done`` are required; ``info``
	    defaults to a fresh empty dict per instance.
	    """

	    observation: Observation
	    reward: float = Field(description="Scalar reward for this step")
	    done: bool = Field(description="Whether the episode has terminated")
	    info: Dict[str, Any] = Field(
	        default_factory=dict,
	        description="Additional diagnostics (scores, timing, breakdown, ...)",
	    )