| """Unit tests for the parsing and scoring helpers in inference.py. |
| |
| These guard the two-line action-format contract that the trained policy |
| emits and the reward function consumes. |
| """ |
| from __future__ import annotations |
|
|
| import importlib.util |
| import os |
| import sys |
|
|
# Load inference.py by file path from the directory one level above this test
# file, binding the resulting module object to ``inf``.
ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
spec = importlib.util.spec_from_file_location(
    "inference",
    os.path.join(ROOT, "inference.py"),
)
inf = importlib.util.module_from_spec(spec)
# The module is registered under its name *before* its body is executed.
# NOTE(review): presumably so that code run during import which looks up
# sys.modules["inference"] can find it -- confirm against inference.py.
sys.modules["inference"] = inf
spec.loader.exec_module(inf)
|
|
# Candidate decisions shared by the parsing and policy tests in this module.
OPTIONS = [
    "full cooperation",
    "limited disclosure",
    "seek legal delay",
]
|
|
|
|
def test_parses_well_formed_two_line_completion():
    """A DECISION/PITCH completion yields the named option, the pitch text and ok=True."""
    raw_text = (
        "DECISION: full cooperation\n"
        "PITCH: A clean record protects long-term reputation."
    )

    chosen, rationale, well_formed = inf.parse_completion(raw_text, OPTIONS)

    assert chosen == "full cooperation"
    assert "clean record" in rationale
    assert well_formed is True
|
|
|
|
def test_falls_back_to_substring_match_when_format_partially_broken():
    """Free text that merely mentions an option still selects it, but ok is False."""
    raw_text = "I think we should go with limited disclosure to manage risk."

    # The pitch element is not checked by this test.
    chosen, _unused_pitch, well_formed = inf.parse_completion(raw_text, OPTIONS)

    assert chosen == "limited disclosure"
    assert well_formed is False
|
|
|
|
def test_returns_first_option_when_completion_unintelligible():
    """Text naming no option resolves to OPTIONS[0] with an empty pitch and ok=False."""
    raw_text = "blah blah blah"

    chosen, rationale, well_formed = inf.parse_completion(raw_text, OPTIONS)

    assert chosen == OPTIONS[0]
    assert rationale == ""
    assert well_formed is False
|
|
|
|
def test_pitch_clipped_to_first_line():
    """Only the PITCH line itself is kept; a following extra line is excluded."""
    raw_text = (
        "DECISION: seek legal delay\n"
        "PITCH: buying time preserves runway.\n"
        "Extra trailing line."
    )

    # The decision element is not checked by this test.
    _unused_decision, rationale, well_formed = inf.parse_completion(raw_text, OPTIONS)

    assert "buying time preserves runway." in rationale
    assert "Extra trailing line" not in rationale
    assert well_formed is True
|
|
|
|
def test_keyword_pitch_score_rewards_role_alignment():
    """Role-flavoured pitches score at least 0.5; unrelated text scores exactly 0.0."""
    aligned_cases = (
        ("runway discipline and burn control protect compliance", "CFO"),
        ("engineering quality and team morale make velocity sustainable", "CTO"),
    )
    for pitch_text, role in aligned_cases:
        assert inf.keyword_pitch_score(pitch_text, role) >= 0.5

    off_topic_score = inf.keyword_pitch_score("totally unrelated text about cats", "CFO")
    assert off_topic_score == 0.0
|
|
|
|
def test_keyword_pitch_score_zero_on_empty_pitch():
    """An empty or whitespace-only pitch scores exactly 0.0."""
    blank_cases = (
        ("", "CTO"),
        (" ", "CFO"),
    )
    for pitch_text, role in blank_cases:
        assert inf.keyword_pitch_score(pitch_text, role) == 0.0
|
|
|
|
def test_random_policy_act_returns_valid_option():
    """RandomPolicy.act returns one of the offered options, an empty pitch and ok=False."""

    class FakeObs:
        # Minimal stand-in observation exposing only these two attributes.
        options = OPTIONS
        npc_statements = []

    policy = inf.RandomPolicy()
    observation = FakeObs()

    chosen, rationale, well_formed = policy.act(observation)

    assert chosen in OPTIONS
    assert rationale == ""
    assert well_formed is False
|
|
|
|
def test_run_summary_aggregates_metrics_correctly():
    """summarise() over two hand-built episodes produces the expected aggregate fields."""

    def close(actual, expected):
        # Same absolute tolerance used for every float comparison below.
        return abs(actual - expected) < 1e-6

    first_episode = inf.EpisodeMetrics(
        seed=1,
        total_reward=2.0,
        final_profitability=50.0,
        survived=True,
        votes_won=7,
        votes_total=10,
        pitches_written=4,
        avg_pitch_score=0.3,
    )
    second_episode = inf.EpisodeMetrics(
        seed=2,
        total_reward=4.0,
        final_profitability=70.0,
        survived=True,
        votes_won=9,
        votes_total=10,
        pitches_written=8,
        avg_pitch_score=0.5,
    )

    summary = inf.summarise("trained-test", [first_episode, second_episode])

    assert summary.policy == "trained-test"
    assert summary.n_episodes == 2
    assert close(summary.mean_reward, 3.0)
    assert close(summary.mean_profitability, 60.0)
    assert summary.survival_rate == 1.0
    assert close(summary.win_rate_per_round, 0.8)
    assert close(summary.pitch_usage_rate, 0.6)
    assert close(summary.mean_pitch_score, 0.4)
|
|