AmongUS/tests/test_eval_suite.py
5h4dy's picture
Upload folder using huggingface_hub
15e955d verified
import json
from amongus_env.eval_suite import evaluate_trace, main, run_eval_suite
from amongus_env.golden_episode import run_golden_episode
def test_eval_suite_scores_current_golden_trace() -> None:
    """Check that the unmodified golden episode trace passes evaluation.

    Pins the report's schema version, episode name, step count, total
    reward, and the exact ordered list of check ids, and requires every
    check to be marked ok.
    """
    expected_check_ids = [
        "labels_sequence",
        "task_reward",
        "meeting_protocol",
        "false_alibi_penalty",
        "bot_vote_ejects_false_claimant",
    ]

    report = evaluate_trace(run_golden_episode())

    # Top-level report fields.
    assert report["schema_version"] == 1
    assert report["ok"] is True
    assert report["episode"] == "golden_false_alibi"

    # Aggregate numbers for the episode.
    summary = report["summary"]
    assert summary["steps"] == 6
    assert summary["total_reward"] == -1.3

    # Individual checks: ids must match in order, and all must pass.
    checks = report["checks"]
    assert [entry["id"] for entry in checks] == expected_check_ids
    assert all(entry["ok"] for entry in checks)
def test_eval_suite_runs_multiple_baseline_scenarios() -> None:
    """Check that the full suite reports six scenarios, all passing.

    The scenario episode names are compared as an ordered list, so both
    membership and order are pinned.
    """
    expected_summary = {
        "scenarios": 6,
        "passed": 6,
        "failed": 0,
    }
    expected_episodes = [
        "golden_false_alibi",
        "invalid_move_no_state_change",
        "crewmate_task_route",
        "meeting_pass_no_majority",
        "impostor_parity_win",
        "kill_cooldown_blocks_second_kill",
    ]

    suite_report = run_eval_suite()

    assert suite_report["schema_version"] == 1
    assert suite_report["ok"] is True
    assert suite_report["summary"] == expected_summary

    episodes = [entry["episode"] for entry in suite_report["scenarios"]]
    assert episodes == expected_episodes
def test_eval_suite_fails_mutated_trace() -> None:
    """Check that zeroing the reward at trace index 4 fails the evaluation.

    The overall result must be not-ok and "false_alibi_penalty" must be
    among the failed checks.
    """
    mutated = run_golden_episode()
    mutated[4]["observation"]["reward"] = 0.0

    report = evaluate_trace(mutated)

    # Collect the ids of every check explicitly marked as failed.
    failing_ids = set()
    for entry in report["checks"]:
        if entry["ok"] is False:
            failing_ids.add(entry["id"])

    assert report["ok"] is False
    assert "false_alibi_penalty" in failing_ids
def test_eval_suite_fails_each_named_check_when_trace_is_mutated() -> None:
    """Check that every named check fails when its part of the trace is mutated.

    Each entry maps a check id to a mutation that should break that check.
    Every mutation is applied to a trace obtained from a new
    ``run_golden_episode()`` call, and the evaluation must then be not-ok
    with the targeted check among the failed ones.
    """
    mutations = {
        "labels_sequence": lambda trace: trace[0].update({"label": "wrong"}),
        "task_reward": lambda trace: trace[2]["observation"].update(
            {"reward": 0.0}
        ),
        "meeting_protocol": lambda trace: trace[3]["observation"].update(
            {"voting_open": True}
        ),
        # Same mutation test_eval_suite_fails_mutated_trace uses for this
        # check; included so the table covers all five named checks.
        "false_alibi_penalty": lambda trace: trace[4]["observation"].update(
            {"reward": 0.0}
        ),
        "bot_vote_ejects_false_claimant": lambda trace: trace[5][
            "observation"
        ].update({"message_log": ["No majority; nobody ejected"]}),
    }

    for check_id, mutate in mutations.items():
        trace = run_golden_episode()
        mutate(trace)

        result = evaluate_trace(trace)
        failed_checks = {
            check["id"] for check in result["checks"] if check["ok"] is False
        }

        # Messages name the mutation so a failure inside the loop is traceable.
        assert result["ok"] is False, (
            f"mutation targeting {check_id!r} left the trace passing"
        )
        assert check_id in failed_checks, (
            f"expected {check_id!r} to fail; failed checks: {failed_checks}"
        )
def test_eval_suite_malformed_task_trace_fails_without_raising() -> None:
    """Check that an empty task_list at trace index 2 yields a failed check.

    ``evaluate_trace`` is expected to return a not-ok result whose
    "task_reward" check is marked failed, rather than raising.
    """
    malformed = run_golden_episode()
    malformed[2]["observation"]["task_list"] = []

    report = evaluate_trace(malformed)

    # First check whose id is "task_reward"; no default is given to next(),
    # so a report without that check raises StopIteration.
    task_reward_check = next(
        filter(lambda entry: entry["id"] == "task_reward", report["checks"])
    )

    assert report["ok"] is False
    assert task_reward_check["ok"] is False
def test_eval_suite_cli_prints_valid_json(capsys) -> None:
    """Check that main() writes parseable JSON for a passing suite to stdout.

    Args:
        capsys: pytest fixture used to read what was written to stdout.
    """
    main()

    captured = capsys.readouterr()
    payload = json.loads(captured.out)

    assert payload["ok"] is True
    # Truthiness check: the scenarios entry must be non-empty.
    assert payload["scenarios"]