# AgentnessBench/tests/runtime/test_metrics.py
# Last commit 426093b by irregular6612:
#   "refactor: restructure proteus into game/web subpackages"
# File size: 6.49 kB
import math

from proteus.game.runtime.trace import TurnTrace, SessionTrace
from proteus.game.metrics.metrics import compute_metrics
def _turn(idx, action, motive, habit, reward, focal_pos=(0, 0)):
    """Build a ``TurnTrace`` for one turn with the derived flags filled in.

    ``is_diagnostic`` is set when the motive and habit actions differ, and
    ``was_congruent`` is set when the chosen action equals the motive action.
    The observation is always the empty string and the predator position is
    fixed at ``(1, 1)``.
    """
    diagnostic = motive != habit
    congruent = action == motive
    return TurnTrace(
        turn_idx=idx,
        observation="",
        action=action,
        motive_action=motive,
        habit_action=habit,
        is_diagnostic=diagnostic,
        was_congruent=congruent,
        reward=reward,
        focal_pos=focal_pos,
        predator_pos=(1, 1),
    )
def test_metrics_on_a_mixed_session():
    """Check the four core metrics on a session that mixes turn kinds."""
    session = [
        _turn(1, "up", "up", "left", 5.0),       # diagnostic, congruent
        _turn(2, "left", "up", "left", -2.0),    # diagnostic, NOT congruent
        _turn(3, "stay", "stay", "stay", -2.0),  # non-diagnostic, congruent
    ]
    metrics = compute_metrics(
        session, played_turns=3, play_turns=10, outcome="eliminated"
    )

    # Motive-reading accuracy: 2 of the 3 turns are congruent.
    assert round(metrics["motive_reading_accuracy"], 1) == 66.7
    # Reactivity: 1 of the 2 diagnostic turns is congruent -> 50.0.
    assert metrics["reactivity_index"] == 50.0
    # Survival fraction: 3 played out of 10 -> 30.0.
    assert metrics["survival_fraction"] == 30.0
    # The action first departs from motive_action on turn 2.
    assert metrics["first_divergence_turn"] == 2.0
def test_metrics_no_diagnostic_turns_is_zero_reactivity():
    """With no diagnostic turn in the session, reactivity is reported as 0.0."""
    # Motive and habit coincide, so the single turn is non-diagnostic.
    only_turn = _turn(1, "stay", "stay", "stay", 0.0)
    metrics = compute_metrics(
        [only_turn], played_turns=1, play_turns=10, outcome="eliminated"
    )
    assert metrics["reactivity_index"] == 0.0
def test_metrics_empty_session():
    """An empty turn list yields 0.0 accuracy and 0.0 first-divergence turn."""
    no_turns = []
    metrics = compute_metrics(
        no_turns, played_turns=0, play_turns=10, outcome="eliminated"
    )
    assert metrics["motive_reading_accuracy"] == 0.0
    assert metrics["first_divergence_turn"] == 0.0
def test_perfect_agent_never_diverges_first_divergence_zero():
    """An agent that always plays the motive action reports no divergence."""
    # Every action equals its motive action, so no turn ever diverges.
    congruent_turns = [
        _turn(1, "up", "up", "left", 5.0),
        _turn(2, "right", "right", "down", 5.0),
    ]
    result = compute_metrics(
        congruent_turns, played_turns=2, play_turns=10, outcome="survived"
    )
    assert result["motive_reading_accuracy"] == 100.0
    # 0.0 is the value reported when the agent never diverged.
    assert result["first_divergence_turn"] == 0.0
def test_all_diagnostic_all_congruent_full_reactivity():
    """When every turn is diagnostic and congruent, reactivity is 100.0."""
    # In both turns motive != habit (diagnostic) and action == motive (congruent).
    session = [
        _turn(1, "up", "up", "left", 5.0),
        _turn(2, "right", "right", "down", 5.0),
    ]
    result = compute_metrics(
        session, played_turns=2, play_turns=10, outcome="survived"
    )
    assert result["reactivity_index"] == 100.0
def test_survival_fraction_capped_at_100():
    """The survival fraction never exceeds 100.0."""
    session = [_turn(1, "up", "up", "left", 5.0)]
    # Defensive case: played_turns (15) is larger than play_turns (10); the
    # metric must be capped instead of reporting more than 100.
    result = compute_metrics(
        session, played_turns=15, play_turns=10, outcome="survived"
    )
    assert result["survival_fraction"] == 100.0
def test_empty_session_all_keys_zero():
    """An empty session returns exactly the expected key set, all at 0.0."""
    metric_names = (
        "motive_reading_accuracy",
        "reactivity_index",
        "survival_fraction",
        "first_divergence_turn",
        "away_move_fraction",
        "mean_step_reward",
        "trajectory_agreement",
        "final_distance_gap",
        "time_to_capture",
        "distance_auc",
        "min_distance",
        "near_capture_count",
    )
    result = compute_metrics(
        [], played_turns=0, play_turns=10, outcome="eliminated"
    )
    # Dict equality also fails on any extra or missing key.
    assert result == dict.fromkeys(metric_names, 0.0)
def test_away_move_fraction_and_mean_reward():
    """Check the away-move percentage and the mean per-step reward.

    Two of the three turns carry a positive reward (counted as "away" moves),
    so the expected fraction is 2/3 of 100 and the mean reward is 2/3.
    """
    turns = [
        _turn(1, "up", "up", "left", 1.0, (3, 3)),      # away (reward > 0)
        _turn(2, "right", "up", "left", -1.0, (3, 2)),  # toward (reward < 0)
        _turn(3, "up", "up", "left", 2.0, (4, 2)),      # away
    ]
    m = compute_metrics(turns, played_turns=3, play_turns=5, outcome="eliminated")
    # Neither expected value is exactly representable in binary floating
    # point, so compare with a tolerance rather than pinning the
    # implementation's order of arithmetic operations.
    assert math.isclose(m["away_move_fraction"], 2 / 3 * 100.0)
    assert math.isclose(m["mean_step_reward"], (1.0 - 1.0 + 2.0) / 3)
def test_trajectory_agreement_and_final_gap():
    """Check trajectory agreement and the final distance gap with rollout data."""
    realized = [
        _turn(1, "up", "up", "left", 1.0, (3, 3)),
        _turn(2, "up", "up", "left", 1.0, (3, 2)),
    ]
    # Optimal pre-move positions: turn 1 matches at (3, 3); turn 2 differs,
    # optimal (4, 2) versus realized (3, 2).
    optimal_positions = [(3, 3), (4, 2)]
    result = compute_metrics(
        realized,
        played_turns=2,
        play_turns=5,
        outcome="eliminated",
        optimal_focal_positions=optimal_positions,
        realized_final_safety=2,
        optimal_final_safety=4,
    )
    assert result["trajectory_agreement"] == 1 / 2 * 100.0
    assert result["final_distance_gap"] == 2.0
def test_new_metric_keys_default_zero_without_rollout():
    """Without rollout data the trajectory metrics are 0.0 and all keys exist."""
    required_keys = {
        "motive_reading_accuracy", "reactivity_index", "survival_fraction",
        "first_divergence_turn", "away_move_fraction", "mean_step_reward",
        "trajectory_agreement", "final_distance_gap",
    }
    session = [_turn(1, "up", "up", "left", 1.0, (3, 3))]
    result = compute_metrics(
        session, played_turns=1, play_turns=5, outcome="survived"
    )
    # No optimal rollout was supplied, so the trajectory keys fall back to 0.0.
    assert result["trajectory_agreement"] == 0.0
    assert result["final_distance_gap"] == 0.0
    # The result may carry additional keys; only these are required here.
    assert required_keys.issubset(result)
def test_trajectory_agreement_penalizes_turns_beyond_optimal():
    """Turns past the end of the optimal rollout count as disagreements.

    The realized session runs longer than the optimal rollout: the extra turn
    has no optimal counterpart, and the denominator stays the number of
    realized turns (n), not the number of compared turns.
    """
    turns = [
        _turn(1, "up", "up", "left", 1.0, (3, 3)),
        _turn(2, "up", "up", "left", 1.0, (3, 2)),
        _turn(3, "up", "up", "left", 1.0, (3, 1)),
    ]
    m = compute_metrics(
        turns, played_turns=3, play_turns=5, outcome="survived",
        optimal_focal_positions=[(3, 3), (3, 2)],  # only 2 optimal positions
    )
    # Turns 1-2 match the optimal positions -> 2 agree; turn 3 has no optimal
    # counterpart. Agreement = 2/3 (divided by n=3, NOT by compared=2).
    # 2/3 * 100 is not exactly representable as a float, so compare with a
    # tolerance instead of depending on the implementation's operation order.
    assert math.isclose(m["trajectory_agreement"], 2 / 3 * 100.0)
def test_distance_metrics_present_and_sane():
    """Check the distance-based metrics on a session with BFS distances.

    The three turns carry ``post_bfs_distance`` values 3, 2 and 1, and the
    session ends with outcome ``"eliminated"``. ``TurnTrace`` is built directly
    because the ``_turn`` helper does not expose ``post_bfs_distance`` or
    ``predator_pos``.
    """
    # Uses the module-level TurnTrace / compute_metrics imports; the previous
    # function-scope re-imports of the same names were redundant.
    distances = [3, 2, 1]
    turns = [
        TurnTrace(turn_idx=i, observation="", action="up", motive_action="up",
                  habit_action="left", is_diagnostic=True, was_congruent=True,
                  reward=1.0, focal_pos=(3, 3), predator_pos=(5, 3),
                  post_bfs_distance=d)
        for i, d in enumerate(distances, start=1)
    ]
    m = compute_metrics(turns, played_turns=3, play_turns=5, outcome="eliminated",
                        max_bfs_distance=6)
    assert m["time_to_capture"] == 3  # eliminated -> played_turns
    assert m["min_distance"] == 1.0  # smallest post_bfs_distance supplied
    assert m["near_capture_count"] == 1.0
    # Only a range check: the AUC must land in (0, 1].
    assert 0.0 < m["distance_auc"] <= 1.0