Spaces:
Sleeping
Sleeping
| """Unit tests for the scripted-policy benchmark runner. | |
| The runner drives a fixed set of non-learning policies through the full | |
| environment without LLM calls. These tests pin: | |
| 1. Each policy returns a valid action or None when run offline. | |
| 2. The aggregator produces per-policy means and a discrimination delta. | |
| 3. The headline policy sweep keeps the heuristic ≥ 0.40 above the naive floor. | |
| """ | |
| from __future__ import annotations | |
| from runners.benchmark_runner import ( | |
| POLICY_NAMES, | |
| POLICY_REGISTRY, | |
| concede_all_policy, | |
| escalate_all_policy, | |
| heuristic_policy, | |
| naive_policy, | |
| run_multi_seed, | |
| run_policy_on_task, | |
| run_policy_sweep, | |
| ) | |
| from scenarios.simulation import get_task | |
# Task fixture shared by the single-task tests below; resolved once at import
# time via get_task using the "goods_not_received_easy" task id.
_EASY_TASK = get_task("goods_not_received_easy")
def test_policy_registry_matches_public_names():
    """Pin POLICY_NAMES against both the registry keys and the expected set."""
    expected_names = {"heuristic", "escalate_all", "concede_all", "naive"}
    public_names = set(POLICY_NAMES)

    # The registry must expose exactly the advertised names ...
    assert public_names == set(POLICY_REGISTRY)
    # ... and the advertised names must be exactly the four known policies.
    assert public_names == expected_names
def test_heuristic_scores_above_naive_on_easy():
    """Check the heuristic policy out-scores the naive policy on the easy task.

    Also verifies that the heuristic result reports the task id of the task
    it was run on and that at least one step was used.
    """
    heuristic_result = run_policy_on_task(heuristic_policy, _EASY_TASK)
    naive_result = run_policy_on_task(naive_policy, _EASY_TASK)

    assert heuristic_result.score > naive_result.score
    assert heuristic_result.task_id == _EASY_TASK.task_id
    assert heuristic_result.steps_used > 0
def test_concede_all_lands_final_resolution():
    """Run concede_all on the easy task and sanity-check the outcome.

    Only two properties are pinned here: the run used at least one step and
    the score lies in [0, 1]. How the episode terminated and how the score
    ranks against other policies are not asserted by this test.
    """
    outcome = run_policy_on_task(concede_all_policy, _EASY_TASK)

    assert outcome.steps_used > 0
    # Score must stay inside the valid [0, 1] range.
    assert 0.0 <= outcome.score <= 1.0
def test_escalate_all_runs_to_completion():
    """Run escalate_all on the easy task; expect a score in [0, 1] and >0 steps."""
    outcome = run_policy_on_task(escalate_all_policy, _EASY_TASK)

    # Range check first, then confirm the run actually consumed steps.
    assert 0.0 <= outcome.score <= 1.0
    assert outcome.steps_used > 0
def test_sweep_aggregates_and_produces_delta():
    """Verify the default sweep covers every policy and clears the delta bar.

    Checks that the sweep yields one summary per name in POLICY_NAMES, that
    each summary's mean score lies in [0, 1], and that the reported
    discrimination delta is at least 0.40.
    """
    sweep = run_policy_sweep()

    # Index the summaries by policy name to compare against the public names.
    by_policy = {}
    for entry in sweep.policies:
        by_policy[entry.policy] = entry
    assert set(by_policy) == set(POLICY_NAMES)

    # Every per-policy mean must be a valid score.
    assert all(0.0 <= entry.mean_score <= 1.0 for entry in sweep.policies)

    # Per the original note, the delta is heuristic minus naive and has to
    # meet the 0.40 threshold.
    assert sweep.discrimination_delta >= 0.40
def test_sweep_is_deterministic():
    """Two back-to-back default sweeps must serialise to equal dicts."""
    # Run the sweep twice, in order, and keep each run's dict form.
    snapshots = [run_policy_sweep().to_dict() for _ in range(2)]
    assert snapshots[0] == snapshots[1]
def test_multi_seed_sweep_runs_subset():
    """Run a tiny multi-seed grid (2 seeds x 1 difficulty) and check its shape.

    Asserts that the sweep returns at least one policy summary, that every
    summary holds exactly two task scores, and that each task id starts with
    "generated_easy_s".
    """
    result = run_multi_seed(seeds=[42, 17], difficulties=["easy"])

    # Materialise once so the emptiness check and the loop see the same items.
    summaries = list(result.policies)
    # Guard against a vacuous pass: with no summaries the loop below would
    # never execute and the test would succeed without checking anything.
    assert summaries, "run_multi_seed returned no policy summaries"

    for summary in summaries:
        assert len(summary.tasks) == 2
        for task_score in summary.tasks:
            assert task_score.task_id.startswith("generated_easy_s")