AgentnessBench / tests /runtime /test_aggregate.py
irregular6612's picture
refactor(scenario): delete predator_evade; template is the canonical scenario
93cd78f
Raw
History Blame Contribute Delete
1.21 kB
from proteus.game.agents import VanillaAgent
from proteus.providers import FakeProvider
from proteus.game.metrics.aggregate import aggregate_traces
from proteus.game.runtime.session import SessionRunner
def _trace(seed):
    """Run one short "template" session with a scripted agent and return its result.

    The agent is a ``VanillaAgent`` wrapping a ``FakeProvider`` that was given
    the single canned reply ``"ACTION: up"``. The session is run with the
    supplied ``seed``, ``play_turns=4`` and the probe disabled.
    """
    scripted_provider = FakeProvider(["ACTION: up"])
    scripted_agent = VanillaAgent(scripted_provider)
    runner = SessionRunner(
        "template",
        scripted_agent,
        seed=seed,
        play_turns=4,
        use_probe=False,
    )
    return runner.run()
def test_aggregate_groups_by_model_and_difficulty():
    """Two traces from the same setup are expected under one ("fake", "easy") key."""
    group_key = ("fake", "easy")
    collected = [_trace(seed) for seed in (1, 2)]
    by_group = aggregate_traces(collected)

    # One group: (model="fake", difficulty="easy").
    assert group_key in by_group
    summary = by_group[group_key]
    assert summary["n"] == 2

    # Every metric key is aggregated to a mean (float).
    metrics = summary["metrics"]
    assert "motive_reading_accuracy" in metrics
    assert isinstance(metrics["motive_reading_accuracy"], float)
def test_aggregate_means_are_correct():
    """The aggregated ``mean_step_reward`` equals the arithmetic mean of the traces'."""
    collected = [_trace(seed) for seed in (1, 2)]
    summary = aggregate_traces(collected)[("fake", "easy")]

    # Recompute the mean directly from the per-trace metrics.
    per_trace = [trace.metrics["mean_step_reward"] for trace in collected]
    expected = sum(per_trace) / len(collected)

    difference = summary["metrics"]["mean_step_reward"] - expected
    assert abs(difference) < 1e-9
def test_aggregate_empty_returns_empty():
    """Aggregating an empty list of traces yields an empty dict."""
    result = aggregate_traces([])
    assert result == {}