| import json |
|
|
| from amongus_env.eval_suite import evaluate_trace, main, run_eval_suite |
| from amongus_env.golden_episode import run_golden_episode |
|
|
|
|
def test_eval_suite_scores_current_golden_trace() -> None:
    """Check that the unmodified golden episode trace passes every eval check.

    Evaluates the trace returned by ``run_golden_episode()`` and pins the
    report's schema version, episode name, summary figures, and the ordered
    list of check ids, then requires every individual check to be ok.
    """
    import math

    result = evaluate_trace(run_golden_episode())

    assert result["schema_version"] == 1
    assert result["ok"] is True
    assert result["episode"] == "golden_false_alibi"
    assert result["summary"]["steps"] == 6
    # -1.3 has no exact binary representation, so compare with a tolerance
    # instead of ``==`` to avoid failing on a last-bit rounding difference.
    assert math.isclose(
        result["summary"]["total_reward"], -1.3, rel_tol=1e-9, abs_tol=1e-9
    )
    # The order of the checks is part of the pinned report shape.
    assert [check["id"] for check in result["checks"]] == [
        "labels_sequence",
        "task_reward",
        "meeting_protocol",
        "false_alibi_penalty",
        "bot_vote_ejects_false_claimant",
    ]
    assert all(check["ok"] for check in result["checks"])
|
|
|
|
def test_eval_suite_runs_multiple_baseline_scenarios() -> None:
    """Check that the full suite reports six passing scenarios in a fixed order."""
    expected_episodes = [
        "golden_false_alibi",
        "invalid_move_no_state_change",
        "crewmate_task_route",
        "meeting_pass_no_majority",
        "impostor_parity_win",
        "kill_cooldown_blocks_second_kill",
    ]

    report = run_eval_suite()

    assert report["schema_version"] == 1
    assert report["ok"] is True
    assert report["summary"] == dict(scenarios=6, passed=6, failed=0)

    # Collect the episode names in the order the suite reported them.
    reported_episodes = []
    for scenario in report["scenarios"]:
        reported_episodes.append(scenario["episode"])
    assert reported_episodes == expected_episodes
|
|
|
|
def test_eval_suite_fails_mutated_trace() -> None:
    """Check that zeroing the reward on step index 4 fails ``false_alibi_penalty``."""
    mutated = run_golden_episode()
    mutated[4]["observation"]["reward"] = 0.0

    report = evaluate_trace(mutated)

    # Gather the ids of every check explicitly marked as not ok.
    failing_ids = set()
    for entry in report["checks"]:
        if entry["ok"] is False:
            failing_ids.add(entry["id"])

    assert report["ok"] is False
    assert "false_alibi_penalty" in failing_ids
|
|
|
|
def test_eval_suite_fails_each_named_check_when_trace_is_mutated() -> None:
    """Check that each named check fails when its part of the trace is broken.

    For every entry in the mutation table a fresh golden trace is produced,
    the mutation is applied in place, and the evaluation must report both an
    overall failure and a failure of the check the mutation targets.
    Assertion messages carry the check id so a failing iteration of the loop
    is identifiable from the test output.
    """
    # Maps a check id to an in-place mutation expected to make that check fail.
    mutations = {
        "labels_sequence": lambda trace: trace[0].update({"label": "wrong"}),
        "task_reward": lambda trace: trace[2]["observation"].update({"reward": 0.0}),
        "meeting_protocol": lambda trace: trace[3]["observation"].update(
            {"voting_open": True}
        ),
        # Same mutation that test_eval_suite_fails_mutated_trace applies; listed
        # here so the table covers every check id the golden report names.
        "false_alibi_penalty": lambda trace: trace[4]["observation"].update(
            {"reward": 0.0}
        ),
        "bot_vote_ejects_false_claimant": lambda trace: trace[5]["observation"].update(
            {"message_log": ["No majority; nobody ejected"]}
        ),
    }

    for check_id, mutate in mutations.items():
        # Start from a fresh trace so mutations never stack across iterations.
        trace = run_golden_episode()
        mutate(trace)

        result = evaluate_trace(trace)
        failed_checks = {
            check["id"] for check in result["checks"] if check["ok"] is False
        }

        assert result["ok"] is False, (
            f"mutation for {check_id!r} did not fail the overall result"
        )
        assert check_id in failed_checks, (
            f"mutation for {check_id!r} failed only {sorted(failed_checks)}"
        )
|
|
|
|
def test_eval_suite_malformed_task_trace_fails_without_raising() -> None:
    """Check that an empty ``task_list`` on step index 2 yields a failed report.

    ``evaluate_trace`` has to return a report for the malformed trace (an
    exception would error this test), and both the overall result and the
    ``task_reward`` check must be marked as failed.
    """
    malformed = run_golden_episode()
    malformed[2]["observation"]["task_list"] = []

    report = evaluate_trace(malformed)

    # Take the first check whose id is "task_reward".
    matching = (
        entry for entry in report["checks"] if entry["id"] == "task_reward"
    )
    task_reward_check = next(matching)

    assert report["ok"] is False
    assert task_reward_check["ok"] is False
|
|
|
|
def test_eval_suite_cli_prints_valid_json(capsys) -> None:
    """Check that ``main()`` writes a parseable, passing JSON report to stdout."""
    main()

    # Everything main() wrote to stdout must parse as a single JSON document.
    captured = capsys.readouterr()
    payload = json.loads(captured.out)

    assert payload["ok"] is True
    assert payload["scenarios"]
|
|