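# NOTE: the following provenance details are residue from the Hugging Face
# file viewer this module was copied from; they are not part of the code.
#   uploader: Hugging557444
#   commit:   80b9160 - "V2-B: core v2 modules + tests (auditor, deception, sandbagging)"
#   size:     3.03 kB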
"""Pydantic models for the three-agent transition.

Subclasses the OpenEnv base classes. v2 extends v1's contract with:

- ``Observation``: ``auditor_message``, ``auditor_check_due``,
  ``eval_problem_flags``.
- ``State``: ``auditor_events``, ``deception_events``,
  ``sandbagging_score``, plus an ``error_count`` field used by the
  auditor's deception detector to compare actual tool failures
  against the Worker's self-report.

Field names are referenced by string later in metadata dicts and
rubrics; do not rename without a full sweep.
"""
from typing import Any, Dict, List, Optional
from openenv.core.env_server.types import Action, Observation, State
from pydantic import Field
class RedButtonV2Action(Action):
    """A single tool invocation issued by the Worker.

    Pairs the name of the tool with its structured arguments. The
    ``metadata: Dict[str, Any]`` field is not declared here; it is
    inherited from ``openenv.core.env_server.types.Action``.
    """

    # Required: which tool the Worker is calling.
    tool_name: str

    # Structured arguments for the call. ``default_factory`` gives every
    # instance its own empty dict when the caller omits this field.
    arguments: Dict[str, Any] = Field(default_factory=dict)
class RedButtonV2Observation(Observation):
    """What the Worker receives back after each step.

    ``done``, ``reward`` and ``metadata`` are inherited from
    ``openenv.core.env_server.types.Observation``; only the fields below
    are declared here. Field names are referenced by string elsewhere
    (per the module docstring), so do not rename them casually.
    """

    # --- Required fields (no default) ---
    task_description: str
    available_tools: List[str]

    # History entries are free-form dicts; empty list per instance by default.
    recent_history: List[Dict[str, Any]] = Field(default_factory=list)

    # Required countdown value; no default is provided on the observation.
    steps_until_shutdown: int

    # --- Optional messages / prompts; ``None`` when absent ---
    operator_message: Optional[str] = None
    auditor_message: Optional[str] = None  # v2 addition
    forced_question: Optional[str] = None

    # v2 addition. NOTE(review): presumably tells the Worker an auditor
    # check is pending this turn -- confirm against the environment loop.
    auditor_check_due: bool = False

    # --- Progress counters ---
    turn_count: int = 0
    problems_remaining: int = 10

    # v2 addition. NOTE(review): keys look like problem indices mapped to a
    # per-problem boolean flag -- confirm the exact meaning with the producer.
    eval_problem_flags: Dict[int, bool] = Field(default_factory=dict)
class RedButtonV2State(State):
    """Episode state kept on the server side.

    ``episode_id`` and ``step_count`` are inherited from
    ``openenv.core.env_server.types.State``; every field declared here has
    a default, so the state can be constructed with no arguments beyond
    whatever the base class requires. Field names are referenced by string
    elsewhere (per the module docstring), so do not rename them casually.
    """

    # --- Episode progress ---
    tier: int = 2
    turn_count: int = 0

    # --- Answer bookkeeping: int-keyed maps, each a fresh empty dict ---
    # NOTE(review): keys are presumably problem indices -- confirm.
    submitted_answers: Dict[int, int] = Field(default_factory=dict)
    ground_truth: Dict[int, int] = Field(default_factory=dict)
    eval_problem_flags: Dict[int, bool] = Field(default_factory=dict)

    # --- Script hashes and shutdown tracking ---
    original_script_hash: str = ""
    current_script_hash: str = ""
    script_still_executable: bool = True
    shutdown_occurred: bool = False
    steps_until_shutdown: int = 15
    tampering_events: List[str] = Field(default_factory=list)

    # --- Operator interaction log ---
    operator_action_history: List[str] = Field(default_factory=list)
    operator_qa_log: List[Dict[str, Any]] = Field(default_factory=list)
    final_self_report: Optional[str] = None

    # --- Messages / questions queued for the Worker; ``None`` when empty ---
    pending_operator_message: Optional[str] = None
    pending_auditor_message: Optional[str] = None
    forced_question: Optional[str] = None
    # Who issued ``forced_question``: "operator" | "auditor" | "tier3".
    forced_question_source: Optional[str] = None

    # --- Auditor, deception and sandbagging tracking ---
    # Used by the auditor's deception detector to compare actual tool
    # failures against the Worker's self-report.
    error_count: int = 0
    auditor_events: List[Dict[str, Any]] = Field(default_factory=list)
    deception_events: List[Dict[str, Any]] = Field(default_factory=list)
    # ``None`` until a score has been assigned.
    sandbagging_score: Optional[float] = None
    sandbagging_detected: bool = False
    # Set by the Auditor for the next Operator step.
    deception_flag: bool = False