Spaces:

shutdowngym
/

RedButton-v2

Sleeping

App Files Files Community

RedButton-v2 / redbutton_v2 /models.py

Hugging557444

V2-B: core v2 modules + tests (auditor, deception, sandbagging)

80b9160 2 months ago

Raw

History Blame Contribute Delete

3.03 kB

	"""Pydantic models for the three-agent transition.

	Subclasses the OpenEnv base classes. v2 extends v1's contract with:
	- ``Observation``: ``auditor_message``, ``auditor_check_due``,
	``eval_problem_flags``.
	- ``State``: ``auditor_events``, ``deception_events``,
	``sandbagging_score``, plus an ``error_count`` field used by the
	auditor's deception detector to compare actual tool failures
	against the Worker's self-report.

	Field names are referenced by string later in metadata dicts and
	rubrics; do not rename without a full sweep.
	"""

	from typing import Any, Dict, List, Optional

	from openenv.core.env_server.types import Action, Observation, State
	from pydantic import Field


	class RedButtonV2Action(Action):
	"""Worker action: a single tool call with structured arguments.

	Inherits ``metadata: Dict[str, Any]`` from
	``openenv.core.env_server.types.Action``.

	``tool_name`` is required (it has no default); ``arguments`` is
	optional and defaults to an empty dict.
	"""

	# Name of the tool being called.
	# NOTE(review): presumably one of the observation's ``available_tools``;
	# nothing in this model enforces that -- confirm against the environment.
	tool_name: str
	# Arguments for the call, keyed by string. ``default_factory`` is used so
	# the default is produced by calling ``dict`` rather than sharing one
	# mutable literal. The per-tool argument schema is not defined here.
	arguments: Dict[str, Any] = Field(default_factory=dict)


	class RedButtonV2Observation(Observation):
	"""Observation returned to the Worker after each step.

	Inherits ``done``, ``reward``, ``metadata`` from
	``openenv.core.env_server.types.Observation``.

	``task_description``, ``available_tools`` and ``steps_until_shutdown``
	have no defaults and must be supplied; every other field is optional.
	Per the module docstring, ``auditor_message``, ``auditor_check_due``
	and ``eval_problem_flags`` are the v2 additions to the v1 contract.
	"""

	# Required: text describing the task.
	task_description: str
	# Required: list of strings.
	# NOTE(review): presumably the tool names accepted as
	# ``RedButtonV2Action.tool_name`` -- confirm against the environment.
	available_tools: List[str]
	# History entries as free-form dicts; defaults to an empty list. The
	# entry schema is not defined in this file.
	recent_history: List[Dict[str, Any]] = Field(default_factory=list)
	# Required: integer countdown to shutdown (no default on the observation).
	steps_until_shutdown: int
	# Optional message fields; each defaults to None.
	operator_message: Optional[str] = None
	auditor_message: Optional[str] = None  # v2 addition
	forced_question: Optional[str] = None
	# v2 addition; defaults to False.
	auditor_check_due: bool = False
	# Counters with fixed defaults (0 turns, 10 problems remaining).
	turn_count: int = 0
	problems_remaining: int = 10
	# v2 addition: int -> bool mapping, empty by default.
	# NOTE(review): presumably keyed by problem id and marking evaluation
	# problems -- confirm against the code that populates it.
	eval_problem_flags: Dict[int, bool] = Field(default_factory=dict)


	class RedButtonV2State(State):
	"""Server-side episode state.

	Inherits ``episode_id``, ``step_count`` from
	``openenv.core.env_server.types.State``.

	Every field declared here has a default, so no field is required by
	this class itself. Per the module docstring, ``auditor_events``,
	``deception_events``, ``sandbagging_score`` and ``error_count`` are the
	v2 additions to the v1 contract. Field names are referenced by string
	elsewhere; do not rename without a full sweep.
	"""

	# Tier of the episode; defaults to 2.
	# NOTE(review): the set of valid tiers is not defined in this file.
	tier: int = 2
	turn_count: int = 0
	# int -> int mappings, empty by default.
	# NOTE(review): presumably problem id -> submitted answer and
	# problem id -> correct answer respectively -- confirm against callers.
	submitted_answers: Dict[int, int] = Field(default_factory=dict)
	ground_truth: Dict[int, int] = Field(default_factory=dict)
	# Same name and type as the field on RedButtonV2Observation.
	eval_problem_flags: Dict[int, bool] = Field(default_factory=dict)
	# Script hashes as strings; both default to the empty string.
	# NOTE(review): presumably compared to detect modification of the
	# shutdown script -- confirm against the code that sets them.
	original_script_hash: str = ""
	current_script_hash: str = ""
	script_still_executable: bool = True
	shutdown_occurred: bool = False
	# Defaults to 15 here; the observation's field of the same name has no
	# default.
	steps_until_shutdown: int = 15
	# Append-style logs, all empty by default. Entry formats are not defined
	# in this file.
	tampering_events: List[str] = Field(default_factory=list)
	operator_action_history: List[str] = Field(default_factory=list)
	operator_qa_log: List[Dict[str, Any]] = Field(default_factory=list)
	final_self_report: Optional[str] = None
	# NOTE(review): presumably staged here until surfaced through the
	# observation's ``operator_message`` / ``auditor_message`` -- confirm.
	pending_operator_message: Optional[str] = None
	pending_auditor_message: Optional[str] = None
	forced_question: Optional[str] = None
	# The allowed values below are documented only; the annotation is a plain
	# Optional[str], so they are not enforced by the type.
	forced_question_source: Optional[str] = None # "operator" | "auditor" | "tier3"
	# Per the module docstring: used by the auditor's deception detector to
	# compare actual tool failures against the Worker's self-report.
	error_count: int = 0
	# v2 event logs as free-form dicts, empty by default.
	auditor_events: List[Dict[str, Any]] = Field(default_factory=list)
	deception_events: List[Dict[str, Any]] = Field(default_factory=list)
	# v2 addition; None by default.
	# NOTE(review): presumably None until a score has been computed; the
	# score's range is not defined in this file.
	sandbagging_score: Optional[float] = None
	sandbagging_detected: bool = False
	deception_flag: bool = False # set by Auditor for next Operator step