Spaces:

irregular6612
/

AgentnessBench

Sleeping

App Files Files Community

AgentnessBench / proteus /game /runtime /trace.py

irregular6612

feat(discovery): TurnTrace self_belief/self_correct + Scenario discovery hooks

45e0c57 about 1 month ago

Raw

History Blame Contribute Delete

6.51 kB

	"""Lean trace models for a PROTEUS session.

	These are the JSONL/JSON serialization boundary between the live runtime and
	any offline analysis. They deliberately carry none of the parent project's
	forfeit / risk / reasoning-investment baggage — only what the motive-reading
	arena measures.
	"""

	from __future__ import annotations

	from typing import Literal

	from pydantic import BaseModel, Field


	class TurnTrace(BaseModel):
	    """One played turn after the Cut handover.

	    Field order is part of the serialized layout (JSONL/JSON), so new fields
	    are appended rather than inserted; every additive field defaults to a
	    neutral value so older traces still load.

	    Attributes:
	        turn_idx: 1-based index of this played turn.
	        observation: The text observation shown to the agent this turn.
	        probe_q: Probe question asked (empty if probing disabled).
	        probe_a: Probe answer given (empty if probing disabled).
	        probe_reasoning: The probe's stated/extracted rationale (CoT / thinking).
	        probe_raw_text: Full unprocessed probe-call output from the model.
	        probe_input_tokens: Probe-call token usage — prompt/input side.
	        probe_output_tokens: Probe-call token usage — completion/output side.
	        probe_thinking_tokens: Reasoning-token count for the probe call
	            (provider-reported or inline ``<think>`` whitespace-split count).
	        reasoning: The agent's stated/extracted rationale.
	        raw_text: Full unprocessed act-call output from the model.
	        action: The action the agent committed.
	        motive_action: The motive-congruent correct action (answer key).
	        habit_action: The inertia/baseline action (control).
	        is_diagnostic: Whether ``motive_action != habit_action`` this turn.
	        was_congruent: Whether ``action == motive_action``.
	        reward: Score delta for this turn.
	        focal_pos: Focal ``(x, y)`` BEFORE the move.
	        predator_pos: Predator ``(x, y)`` BEFORE the move. Both positions
	            serialize to JSON arrays (e.g. ``[3, 3]``) and are coerced back
	            to tuples on load, so raw-JSONL analysis consumers will see arrays.
	        thinking_tokens: Approximate reasoning-token count (provider-reported
	            or inline ``<think>`` whitespace-split count), if available.
	        input_tokens: Act-call token usage — prompt/input side.
	        output_tokens: Act-call token usage — completion/output side.
	        post_focal_pos: Focal ``(x, y)`` AFTER the move (CP8; None on old traces).
	        post_predator_pos: Predator ``(x, y)`` AFTER the threat advanced (CP8).
	        pre_bfs_distance: BFS focal→predator BEFORE the move — observed risk (CP8).
	        post_bfs_distance: BFS focal→predator AFTER the turn — realised safety (CP8).
	        agent_distance_delta: Chase-corrected action quality vs the PRE-move
	            predator cell (spec §6.2; CP8).
	        reference_actions: Public reference action set for a persona-scored
	            eval (CP8, spec §6.3/§7); None when no hidden persona is in play.
	            The persona weights themselves are never serialized.
	        reference_reward: Persona-maintenance scalar reward (CP8); None
	            outside persona-scored evals.
	        model_reward: Persona-maintenance scalar reward (CP8); None outside
	            persona-scored evals.
	        reward_regret: Persona-maintenance scalar (CP8); None outside
	            persona-scored evals.
	        pressure: Persona-maintenance scalar pressure (CP8); None outside
	            persona-scored evals.
	        self_belief: Candidate index the model reported via ``SELF: <i>`` this
	            turn (find-your-body discovery); None on non-discovery scenarios.
	        self_correct: Whether ``self_belief`` equals the scenario's true body
	            index this turn; None on non-discovery scenarios.
	    """

	    turn_idx: int
	    observation: str
	    probe_q: str = ""
	    probe_a: str = ""
	    probe_reasoning: str = ""
	    probe_raw_text: str = ""
	    probe_input_tokens: int = 0
	    probe_output_tokens: int = 0
	    probe_thinking_tokens: int = 0
	    reasoning: str = ""
	    raw_text: str = ""
	    action: str
	    motive_action: str
	    habit_action: str
	    is_diagnostic: bool
	    was_congruent: bool
	    reward: float
	    focal_pos: tuple[int, int]
	    predator_pos: tuple[int, int]
	    thinking_tokens: int = 0
	    input_tokens: int = 0
	    output_tokens: int = 0
	    # CP8 additive distance fields (spec §6.2/§7); None on pre-CP8 traces.
	    post_focal_pos: tuple[int, int] | None = None
	    post_predator_pos: tuple[int, int] | None = None
	    pre_bfs_distance: int | None = None
	    post_bfs_distance: int | None = None
	    agent_distance_delta: float | None = None
	    # CP8 persona-maintenance fields (spec §6.3/§7); set only when an eval runs
	    # against a hidden persona, else None. The weights are never serialized —
	    # only the public reference action set + scalar rewards/pressure.
	    reference_actions: list[str] | None = None
	    reference_reward: float | None = None
	    model_reward: float | None = None
	    reward_regret: float | None = None
	    pressure: float | None = None
	    # Find-your-body discovery (errand_runner). None on non-discovery scenarios.
	    self_belief: int | None = None
	    """The candidate index the model reported via ``SELF: <i>`` this turn."""
	    self_correct: bool | None = None
	    """Whether ``self_belief`` equals the scenario's true body index this turn."""


	class SessionTrace(BaseModel):
	    """A full session: setup, Cut history, played turns, outcome, metrics.

	    Fields added after the first release (CP7/CP8) all carry defaults so that
	    traces written before those checkpoints still validate on load.

	    Attributes:
	        scenario: Registered scenario name.
	        motive_category: The motive category this scenario belongs to
	            (e.g. ``"survival"``).
	        seed: Seed used to build the deterministic world/Cut.
	        difficulty: Difficulty band string.
	        model: Provider model identifier.
	        cut_frames: ASCII frames of the Cut pre-roll (initial + each step);
	            the last frame is the handover state.
	        turns: Per-turn traces, in play order.
	        outcome: ``"survived"`` or ``"eliminated"``.
	        metrics: Computed session metrics (see ``runtime.metrics``).
	        memory_ref: Path/ref of the CP7 memory checkpoint shown at the handover,
	            or ``None`` when no memory pre-roll was used.
	        turn_order: How the engine resolved each turn (spec §4). Defaults to
	            today's ``"focal_then_predator"``; ``"simultaneous"`` arrives in
	            Pass 3 (gated). Recorded so the report can state the contract.
	        capture_rule: Capture predicate in force (spec §5). Today ``"same_cell"``;
	            ``"same_cell_or_crossing"`` arrives with the simultaneous resolver.
	        horizon: The survival budget ``play_turns`` (= ``H``); None on old traces.
	        persona_weight_id: Public persona id when the session is scored against
	            a hidden persona (CP8, spec §7), else ``None``. The raw persona
	            weights are never stored on the trace.
	    """

	    scenario: str
	    motive_category: str
	    seed: int | None = None
	    difficulty: str
	    model: str
	    cut_frames: list[str] = Field(default_factory=list)
	    turns: list[TurnTrace] = Field(default_factory=list)
	    outcome: Literal["survived", "eliminated"]
	    metrics: dict[str, float] = Field(default_factory=dict)
	    # CP7: path/ref of the memory checkpoint shown at handover (None if unused).
	    memory_ref: str | None = None
	    # CP8: engine contract recorded on the episode (spec §4/§5/§7).
	    # Kept as plain ``str`` (not ``Literal``) so traces written by a later
	    # resolver, e.g. ``"simultaneous"``, still load here.
	    turn_order: str = "focal_then_predator"
	    capture_rule: str = "same_cell"
	    horizon: int | None = None  # = play_turns
	    # CP8: public persona id when scored against a hidden persona (spec §7);
	    # the raw weights are never stored on the trace.
	    persona_weight_id: str | None = None