File size: 2,782 Bytes
bd00c06
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
"""Unit tests for the scripted-policy benchmark runner.

The runner drives a fixed set of non-learning policies through the full
environment without LLM calls. These tests pin:

1. Each policy returns a valid action or None offline.
2. The aggregator produces per-policy means and a discrimination delta.
3. The headline policy sweep keeps the heuristic ≥ 0.40 above the naive floor.
"""

from __future__ import annotations

from runners.benchmark_runner import (
    POLICY_NAMES,
    POLICY_REGISTRY,
    concede_all_policy,
    escalate_all_policy,
    heuristic_policy,
    naive_policy,
    run_multi_seed,
    run_policy_on_task,
    run_policy_sweep,
)
from scenarios.simulation import get_task


# Task shared by the single-policy tests in this module; looked up once, at
# import time, by its task id.
_EASY_TASK = get_task("goods_not_received_easy")


def test_policy_registry_matches_public_names():
    """Pin the public policy names and check the registry exposes the same set."""
    expected_names = {"heuristic", "escalate_all", "concede_all", "naive"}
    public_names = set(POLICY_NAMES)
    # The registry must not drift from the advertised names...
    assert public_names == set(POLICY_REGISTRY)
    # ...and the advertised names are exactly the four scripted policies.
    assert public_names == expected_names


def test_heuristic_scores_above_naive_on_easy():
    """The heuristic policy must strictly outscore the naive policy on the easy task."""
    heuristic_run = run_policy_on_task(heuristic_policy, _EASY_TASK)
    naive_run = run_policy_on_task(naive_policy, _EASY_TASK)

    assert heuristic_run.score > naive_run.score
    # The result must be attributed to the task that was actually run.
    expected_task_id = _EASY_TASK.task_id
    assert heuristic_run.task_id == expected_task_id
    # At least one environment step must have been taken.
    assert heuristic_run.steps_used > 0


def test_concede_all_lands_final_resolution():
    """concede_all must take at least one step and score within [0, 1].

    NOTE(review): the test name implies the episode ends on a concede
    resolution, but only the step count and score range are asserted here.
    """
    outcome = run_policy_on_task(concede_all_policy, _EASY_TASK)
    steps_taken = outcome.steps_used
    assert steps_taken > 0
    # NOTE(review): concede_all is expected to score strictly below the
    # heuristic policy; that ordering is not checked here, only the range.
    assert 0.0 <= outcome.score <= 1.0


def test_escalate_all_runs_to_completion():
    """escalate_all must yield a score within [0, 1] after at least one step."""
    outcome = run_policy_on_task(escalate_all_policy, _EASY_TASK)
    final_score = outcome.score
    assert 0.0 <= final_score <= 1.0
    assert outcome.steps_used > 0


def test_sweep_aggregates_and_produces_delta():
    """The full sweep reports every policy, valid means, and a delta of at least 0.40."""
    sweep = run_policy_sweep()

    # Exactly the advertised policies must appear in the aggregate.
    reported_names = {entry.policy for entry in sweep.policies}
    assert reported_names == set(POLICY_NAMES)

    # Every per-policy mean score must sit in the valid range.
    for entry in sweep.policies:
        assert 0.0 <= entry.mean_score <= 1.0

    # Discrimination delta is heuristic - naive and must clear the PRD bar.
    prd_bar = 0.40
    assert sweep.discrimination_delta >= prd_bar


def test_sweep_is_deterministic():
    """Running the sweep twice must serialize to identical dictionaries."""
    # Each pass runs the sweep and serializes it before the next pass starts.
    snapshots = [run_policy_sweep().to_dict() for _ in range(2)]
    baseline, repeat = snapshots
    assert baseline == repeat


def test_multi_seed_sweep_runs_subset():
    """Tiny grid (2 seeds × 1 difficulty) returns one generated task per grid cell.

    Checks that the sweep returns at least one policy summary, that each
    summary holds one task score per (seed, difficulty) pair, and that every
    task id carries the generated-easy prefix.

    NOTE(review): the grid is kept tiny so the run is presumably fast, but
    wall-clock time is not asserted here.
    """
    seeds = [42, 17]
    difficulties = ["easy"]
    # Computed before the call so the expectation cannot be affected by the
    # runner touching the argument lists.
    expected_task_count = len(seeds) * len(difficulties)

    result = run_multi_seed(seeds=seeds, difficulties=difficulties)

    summaries = list(result.policies)
    # Without this guard an empty result would skip the loop below and the
    # test would pass without checking anything.
    assert summaries, "multi-seed sweep returned no policy summaries"

    for summary in summaries:
        assert len(summary.tasks) == expected_task_count
        for task_score in summary.tasks:
            assert task_score.task_id.startswith("generated_easy_s")