import pytest

from eval.public_benchmark_package import (
    build_public_eval_protocol,
    build_target_training_spec,
)
from eval.run_public_benchmark_package import summarize_public_benchmark_package


def _successes(num_success: int, total: int = 100) -> list[int]:
    """Return a success vector with `num_success` ones followed by zeros."""
    return [1] * num_success + [0] * (total - num_success)


def _target_record(
    track_id: str,
    adapter_mode: str,
    seed: int,
    num_success: int,
    *,
    intervention: float,
    non_base: float,
) -> dict:
    """Build a synthetic evaluation record for one target track and adapter mode."""
    successes = _successes(num_success)
    record = {
        "track_id": track_id,
        "adapter_mode": adapter_mode,
        "successes": successes,
        "success_rate": sum(successes) / len(successes),
        "episodes": len(successes),
        "seed": seed,
        "eval_protocol": build_public_eval_protocol(
            track_id=track_id,
            eval_mode=adapter_mode,
            seed=seed,
            episodes=len(successes),
        ),
        "intervention_rate": intervention,
        "non_base_selection_rate": non_base,
        "steps_to_first_reveal_or_access": 8.0,
        "steps_to_retrieve": 22.0,
        "disturbance_proxy": 0.3,
    }
    if adapter_mode != "adapter_noop":
        record["train_spec"] = build_target_training_spec(
            track_id=track_id, model_variant=adapter_mode, seed=seed
        )
    else:
        # adapter_noop runs no training of its own; reuse the active-adapter
        # spec so the fairness comparison still has a like-for-like entry.
        record["train_spec"] = build_target_training_spec(
            track_id=track_id, model_variant="adapter_active_ft", seed=seed
        )
    return record


def _anchor_record(adapter_mode: str, seed: int, num_success: int) -> dict:
    """Build a record for the regression anchor track (no train_spec required)."""
    successes = _successes(num_success)
    return {
        "track_id": "anchor_track",
        "adapter_mode": adapter_mode,
        "successes": successes,
        "success_rate": sum(successes) / len(successes),
        "episodes": len(successes),
        "seed": seed,
        "eval_protocol": build_public_eval_protocol(
            track_id="anchor_track",
            eval_mode=adapter_mode,
            seed=seed,
            episodes=len(successes),
        ),
    }


def _passing_payloads() -> list[dict]:
    """A full payload set that should clear every gate: three target tracks
    with a clear active-adapter gain, plus a stable anchor track."""
    return [
        _target_record("bag_track", "trunk_only_ft", 17, 35, intervention=0.0, non_base=0.0),
        _target_record("bag_track", "adapter_noop", 17, 35, intervention=0.0, non_base=0.0),
        _target_record("bag_track", "adapter_active_ft", 17, 75, intervention=0.30, non_base=0.40),
        _target_record("occlusion_track", "trunk_only_ft", 17, 30, intervention=0.0, non_base=0.0),
        _target_record("occlusion_track", "adapter_noop", 17, 30, intervention=0.0, non_base=0.0),
        _target_record("occlusion_track", "adapter_active_ft", 17, 68, intervention=0.24, non_base=0.22),
        _target_record("cloth_track", "trunk_only_ft", 17, 28, intervention=0.0, non_base=0.0),
        _target_record("cloth_track", "adapter_noop", 17, 28, intervention=0.0, non_base=0.0),
        _target_record("cloth_track", "adapter_active_ft", 17, 60, intervention=0.18, non_base=0.20),
        _anchor_record("trunk_only", 17, 96),
        _anchor_record("adapter_noop", 17, 96),
        _anchor_record("adapter_active", 17, 95),
    ]


def test_public_benchmark_package_summary_passes_with_clear_gain():
    payloads = _passing_payloads()
    summary = summarize_public_benchmark_package(payloads, bootstrap_samples=200, bootstrap_seed=0)
    assert summary["headline_pass"]
    assert summary["sign_of_life_pass"]
    assert summary["anchor_pass"]
    assert summary["sign_of_life_track_count"] == 3
    assert summary["tracks"]["bag_track"]["delta_active_vs_trunk"] > 0.0
    assert summary["tracks"]["anchor_track"]["anchor_within_tolerance"]


def test_public_benchmark_package_detects_training_mismatch():
    payloads = _passing_payloads()
    # Index 8 is the cloth_track adapter_active_ft record; perturbing a single
    # training knob should trip the fairness gate.
    payloads[8]["train_spec"]["batch_size"] = 64
    with pytest.raises(ValueError, match="Training fairness mismatch"):
        summarize_public_benchmark_package(payloads, bootstrap_samples=50, bootstrap_seed=0)
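

# A sketch of the summary shape these tests rely on, inferred only from the
# assertions above; summarize_public_benchmark_package may return additional
# fields that are neither asserted nor guaranteed here:
#
#     {
#         "headline_pass": bool,
#         "sign_of_life_pass": bool,
#         "anchor_pass": bool,
#         "sign_of_life_track_count": int,
#         "tracks": {
#             "bag_track": {"delta_active_vs_trunk": float, ...},
#             "anchor_track": {"anchor_within_tolerance": bool, ...},
#         },
#     }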