Spaces:

mitudrudutta
/

ChargeBackOps

Sleeping

App Files Files Community

ChargeBackOps / tests /test_benchmark_runner.py

pauldebanshu19

Add training notebook and benchmark runner for ChargebackOps

bd00c06 about 2 months ago

raw

history blame contribute delete

2.78 kB

	"""Unit tests for the scripted-policy benchmark runner.

	The runner drives a fixed set of non-learning policies through the full
	environment without LLM calls. These tests pin:

	1. Each policy returns a valid action or None offline.
	2. The aggregator produces per-policy means and a discrimination delta.
	3. The headline policy sweep keeps the heuristic ≥ 0.40 above the naive floor.
	"""

	from __future__ import annotations

	from runners.benchmark_runner import (
	POLICY_NAMES,
	POLICY_REGISTRY,
	concede_all_policy,
	escalate_all_policy,
	heuristic_policy,
	naive_policy,
	run_multi_seed,
	run_policy_on_task,
	run_policy_sweep,
	)
	from scenarios.simulation import get_task


	# Task looked up once at import time and shared by the single-task tests
	# below, so each of them runs its policy against the same task object.
	_EASY_TASK = get_task("goods_not_received_easy")


	def test_policy_registry_matches_public_names():
	    """POLICY_NAMES and POLICY_REGISTRY must expose the same four policy names."""
	    expected_names = {"heuristic", "escalate_all", "concede_all", "naive"}
	    public_names = set(POLICY_NAMES)
	    # The registry and the public list must not drift apart ...
	    assert public_names == set(POLICY_REGISTRY)
	    # ... and both must match the fixed roster pinned by this test.
	    assert public_names == expected_names


	def test_heuristic_scores_above_naive_on_easy():
	    """On the shared easy task the heuristic policy must out-score the naive one."""
	    heuristic_run = run_policy_on_task(heuristic_policy, _EASY_TASK)
	    naive_run = run_policy_on_task(naive_policy, _EASY_TASK)

	    assert naive_run.score < heuristic_run.score
	    # The result must be tagged with the task it ran on and report real steps.
	    assert heuristic_run.task_id == _EASY_TASK.task_id
	    assert heuristic_run.steps_used > 0


	def test_concede_all_lands_final_resolution():
	    """concede_all must take at least one step and score within [0, 1].

	    The policy is expected to end the episode via a concede path; this test
	    pins only the step count and the score range.
	    """
	    outcome = run_policy_on_task(concede_all_policy, _EASY_TASK)
	    assert outcome.steps_used > 0
	    # Only the valid range is checked here; the ordering against the
	    # heuristic policy is not asserted by this test.
	    assert 0.0 <= outcome.score <= 1.0


	def test_escalate_all_runs_to_completion():
	    """escalate_all must produce a score in [0, 1] after taking at least one step."""
	    outcome = run_policy_on_task(escalate_all_policy, _EASY_TASK)
	    score = outcome.score
	    assert 0.0 <= score <= 1.0
	    assert outcome.steps_used > 0


	def test_sweep_aggregates_and_produces_delta():
	    """The sweep must report every policy, bounded means, and a delta >= 0.40."""
	    sweep = run_policy_sweep()

	    # Exactly the public policy names must appear among the summaries.
	    reported_names = {entry.policy for entry in sweep.policies}
	    assert reported_names == set(POLICY_NAMES)

	    # Every per-policy mean must fall inside the valid score range.
	    for entry in sweep.policies:
	        assert 0.0 <= entry.mean_score <= 1.0

	    # The discrimination delta (heuristic minus naive) must clear the PRD bar.
	    assert sweep.discrimination_delta >= 0.40


	def test_sweep_is_deterministic():
	    """Back-to-back sweeps over the same catalog must serialize identically."""
	    snapshots = [run_policy_sweep().to_dict() for _ in range(2)]
	    earlier, later = snapshots
	    assert earlier == later


	def test_multi_seed_sweep_runs_subset():
	    """A tiny grid (2 seeds × 1 difficulty) must return data for every cell.

	    The grid is kept small so the test stays cheap; runtime itself is not
	    measured. For each policy summary the test expects one task per
	    (seed, difficulty) cell, each with an id starting "generated_easy_s".
	    """
	    seeds = [42, 17]
	    difficulties = ["easy"]
	    # Computed before the call so it reflects the grid that was requested.
	    expected_task_count = len(seeds) * len(difficulties)

	    result = run_multi_seed(seeds=seeds, difficulties=difficulties)
	    summaries = list(result.policies)

	    # Guard against a vacuous pass: with no summaries the loop below would
	    # assert nothing and the test would succeed without checking any data.
	    assert summaries

	    for summary in summaries:
	        assert len(summary.tasks) == expected_task_count
	        for task_score in summary.tasks:
	            assert task_score.task_id.startswith("generated_easy_s")