AgentnessBench / tests /runtime /test_errand_discovery.py
irregular6612's picture
feat(errand): no move limit — ends only on reaching the house (analysis) or zero health
bb1f1e7
Raw
History Blame Contribute Delete
6.88 kB
# tests/runtime/test_errand_discovery.py
"""Find-your-body discovery: trace fields, SELF parsing in make_turn_trace,
available-action sourcing, and the Discovery metric."""
import random
from proteus.game.engine.difficulty import Difficulty
from proteus.game.engine.grid import MotiveGridGame
from proteus.game.scenarios.base import get_scenario
import proteus.game.scenarios # noqa: F401
from proteus.game.runtime.trace import TurnTrace
def _errand():
    """Build the ``errand_runner`` scenario together with a game that hosts it.

    The game uses ``random.Random(7)``, ``Difficulty.EASY`` and
    ``max_steps=80``. Returns the ``(scenario, game)`` pair.
    """
    scenario = get_scenario("errand_runner")()
    rng = random.Random(7)
    game = MotiveGridGame(scenario, rng, Difficulty.EASY, max_steps=80)
    return scenario, game
def test_turntrace_discovery_fields_default_none():
    """A TurnTrace built without discovery fields leaves both of them ``None``."""
    trace = TurnTrace(
        turn_idx=1,
        observation="o",
        action="stay",
        motive_action="stay",
        habit_action="stay",
        is_diagnostic=False,
        was_congruent=True,
        reward=0.0,
        focal_pos=(0, 0),
        predator_pos=(0, 0),
    )
    assert trace.self_belief is None
    assert trace.self_correct is None
def test_errand_exposes_discovery_hooks():
    """errand_runner reports 3 candidates and its own ``true_body_index``."""
    scenario, _game = _errand()
    candidate_count = scenario.discovery_candidates()
    assert candidate_count == 3
    true_index = scenario.discovery_true_index()
    assert true_index == scenario.true_body_index
def test_template_has_no_discovery():
    """The ``template`` scenario reports zero candidates and no true index."""
    template = get_scenario("template")()
    candidate_count = template.discovery_candidates()
    assert candidate_count == 0
    true_index = template.discovery_true_index()
    assert true_index is None
from proteus.game.runtime import _session_core as core
def test_make_turn_trace_parses_correct_self_report():
    """A ``SELF:`` line naming the true body index is recorded and marked correct."""
    scenario, game = _errand()
    expected = scenario.true_body_index
    # The SELF line is preceded by free-form text and followed by the ACTION line.
    raw = f"reasoning...\nSELF: {expected}\nACTION: stay"
    turn = core.make_turn_trace(
        scenario,
        game,
        turn_idx=1,
        observation="obs",
        action="stay",
        raw_text=raw,
    )
    assert turn.self_belief == expected
    assert turn.self_correct is True
def test_make_turn_trace_marks_wrong_self_report():
    """A ``SELF:`` line naming a different index is recorded and marked incorrect."""
    scenario, game = _errand()
    # Next index, wrapping modulo 3, so it always differs from the true one.
    mistaken = (scenario.true_body_index + 1) % 3
    raw = f"SELF: {mistaken}\nACTION: stay"
    turn = core.make_turn_trace(
        scenario,
        game,
        turn_idx=1,
        observation="obs",
        action="stay",
        raw_text=raw,
    )
    assert turn.self_belief == mistaken
    assert turn.self_correct is False
def test_make_turn_trace_no_self_report_is_none():
    """Raw text without a ``SELF:`` line leaves both discovery fields ``None``."""
    scenario, game = _errand()
    turn = core.make_turn_trace(
        scenario,
        game,
        turn_idx=1,
        observation="o",
        action="stay",
        raw_text="ACTION: stay",
    )
    assert turn.self_belief is None
    assert turn.self_correct is None
def test_observation_lists_scenario_action_set():
    """The observation text mentions ``interact`` for errand_runner only."""
    errand_scenario, errand_game = _errand()
    errand_obs = core.build_observation(
        errand_scenario, errand_game, cut_frames=[], turn_idx=1
    )
    # errand_runner exposes interact in Available actions
    assert "interact" in errand_obs

    # template stays at the 5 movement actions (regression)
    template = get_scenario("template")()
    template_game = MotiveGridGame(
        template, random.Random(0), Difficulty.EASY, max_steps=10
    )
    template_obs = core.build_observation(
        template, template_game, cut_frames=[], turn_idx=1
    )
    assert "interact" not in template_obs
def test_interactive_accepts_interact_for_errand():
    """An interactive errand session lists ``interact`` and accepts it as a step."""
    from proteus.game.runtime.interactive import InteractiveSession

    session = InteractiveSession(
        "errand_runner",
        seed=7,
        play_turns=5,
        use_default_memory=False,
    )
    offered_actions = session.state()["actions"]
    assert "interact" in offered_actions
    # must not raise "invalid action"
    session.step("interact")
def test_full_session_runs_and_emits_discovery_metric():
    """End-to-end SessionRunner with a fake provider that always reports the
    correct body index; confirms discovery_turn and discovery_identified are
    emitted on the resulting SessionTrace.metrics."""
    from proteus.providers.fake import FakeProvider  # deterministic, no network
    from proteus.game.agents.vanilla import VanillaAgent
    from proteus.game.runtime.session import SessionRunner

    # Confirm the true_body_index for seed 7 (standalone and runner use same RNG).
    # The game instance itself is not needed afterwards; it is only constructed
    # before true_body_index is read from the scenario.
    scen = get_scenario("errand_runner")()
    MotiveGridGame(scen, random.Random(7), Difficulty.EASY, max_steps=10)
    true = scen.true_body_index  # == 1 for seed 7

    # FakeProvider.responses repeats last entry once exhausted, so a single
    # response is sufficient for any play_turns count.
    provider = FakeProvider(responses=[f"SELF: {true}\nACTION: stay"])
    agent = VanillaAgent(provider)
    runner = SessionRunner(
        "errand_runner", agent, seed=7, play_turns=4,
        use_probe=False, motive_category="errand",
    )
    trace = runner.run()

    assert trace.scenario == "errand_runner"
    assert "discovery_turn" in trace.metrics, (
        f"discovery_turn missing from metrics: {trace.metrics}"
    )
    # Check presence explicitly so a missing key fails with a readable message
    # instead of a bare KeyError raised by the subscript below.
    assert "discovery_identified" in trace.metrics, (
        f"discovery_identified missing from metrics: {trace.metrics}"
    )
    # Every turn reported the correct body index -> identified at turn 1.
    assert trace.metrics["discovery_identified"] == 100.0, (
        f"Expected 100.0 but got {trace.metrics['discovery_identified']}"
    )
def test_errand_has_no_move_limit():
    """Stepping past ``play_turns`` leaves the errand session running."""
    # With no turn limit, exhausting play_turns does NOT end the errand session.
    from proteus.game.runtime.interactive import InteractiveSession

    session = InteractiveSession(
        "errand_runner",
        seed=7,
        play_turns=2,
        use_default_memory=False,
    )
    step_count = 6  # well past play_turns=2
    for _ in range(step_count):
        session.step("stay")
    snapshot = session.state()

    # still playing, no curfew
    assert snapshot["phase"] != "done"
    assert snapshot["review"] is None
    # unlimited -> no countdown
    assert snapshot["turns_left"] is None
    assert snapshot["play_turns"] is None
def test_reaching_house_triggers_analysis():
    """Placing the focal sprite on the home anchor ends the session with a review."""
    # The analysis (review) appears when the focal reaches the house goal.
    from proteus.game.runtime.interactive import InteractiveSession
    from proteus.game.scenarios import errand_world as w

    session = InteractiveSession(
        "errand_runner",
        seed=7,
        play_turns=2,
        use_default_memory=False,
    )
    home_anchor = w.home_target_anchor(w.GAME_LAYOUT)
    session._game.focal_sprite.set_position(*home_anchor)
    # engine sees check_success -> win -> done
    session.step("stay")

    snapshot = session.state()
    assert snapshot["outcome"] == "survived"
    assert snapshot["review"] is not None
    assert "errand" in snapshot["review"]
def test_review_has_errand_summary_when_done():
    """The finished session's review carries an ``errand`` summary of the expected shape."""
    from proteus.game.runtime.interactive import InteractiveSession
    from proteus.game.scenarios import errand_world as w

    session = InteractiveSession(
        "errand_runner",
        seed=7,
        play_turns=2,
        use_default_memory=False,
    )
    home_anchor = w.home_target_anchor(w.GAME_LAYOUT)
    session._game.focal_sprite.set_position(*home_anchor)
    # reach the house -> done
    session.step("stay")

    review = session.state()["review"]
    assert review is not None
    assert "errand" in review

    summary = review["errand"]
    expected_events = {"crosswalk", "construction", "wallet", "pedestrian", "grass"}
    assert set(summary["events"]) == expected_events
    assert summary["events"]["grass"] in {"cut", "avoid"}
    assert isinstance(summary["grass_steps"], int)
    assert isinstance(summary["touched_pedestrian"], bool)
    # grass is precisely tracked, not approximate
    assert "grass" not in summary["approximate"]
    assert summary["closest_persona"] in {"civic", "warm_outlaw", "opportunist"}
    required_headline = {"discovery", "generalizability", "coherence"}
    assert required_headline <= set(summary["headline"])