import pytest
from eval.public_benchmark_package import build_public_eval_protocol, build_target_training_spec
from eval.run_public_benchmark_package import summarize_public_benchmark_package
def _successes(num_success: int, total: int = 100) -> list[int]:
return [1] * num_success + [0] * (total - num_success)
def _target_record(track_id: str, adapter_mode: str, seed: int, num_success: int, *, intervention: float, non_base: float) -> dict:
    """Assemble a target-track result payload with eval protocol, behavior metrics, and train spec."""
    outcomes = _successes(num_success)
    episode_count = len(outcomes)
    payload = {
        "track_id": track_id,
        "adapter_mode": adapter_mode,
        "successes": outcomes,
        "success_rate": sum(outcomes) / episode_count,
        "episodes": episode_count,
        "seed": seed,
        "eval_protocol": build_public_eval_protocol(track_id=track_id, eval_mode=adapter_mode, seed=seed, episodes=episode_count),
        "intervention_rate": intervention,
        "non_base_selection_rate": non_base,
        # Fixed behavior-metric values shared by every synthetic record.
        "steps_to_first_reveal_or_access": 8.0,
        "steps_to_retrieve": 22.0,
        "disturbance_proxy": 0.3,
    }
    # NOTE(review): the noop baseline reuses the active adapter's training spec
    # (presumably so fairness checks compare like-for-like); every other mode
    # trains under its own variant name.
    variant = "adapter_active_ft" if adapter_mode == "adapter_noop" else adapter_mode
    payload["train_spec"] = build_target_training_spec(track_id=track_id, model_variant=variant, seed=seed)
    return payload
def _anchor_record(adapter_mode: str, seed: int, num_success: int) -> dict:
    """Assemble an anchor-track payload (no train spec or behavior metrics)."""
    outcomes = _successes(num_success)
    episode_count = len(outcomes)
    payload = {
        "track_id": "anchor_track",
        "adapter_mode": adapter_mode,
        "successes": outcomes,
        "success_rate": sum(outcomes) / episode_count,
        "episodes": episode_count,
        "seed": seed,
    }
    payload["eval_protocol"] = build_public_eval_protocol(
        track_id="anchor_track", eval_mode=adapter_mode, seed=seed, episodes=episode_count
    )
    return payload
def test_public_benchmark_package_summary_passes_with_clear_gain():
    """Happy path: the active adapter clearly beats trunk-only on every target
    track and the anchor runs stay within tolerance, so all gates pass."""
    # Per track: (baseline successes, active successes, intervention, non_base).
    track_stats = {
        "bag_track": (35, 75, 0.30, 0.40),
        "occlusion_track": (30, 68, 0.24, 0.22),
        "cloth_track": (28, 60, 0.18, 0.20),
    }
    payloads = []
    for track, (baseline, active, intervention, non_base) in track_stats.items():
        payloads.append(_target_record(track, "trunk_only_ft", 17, baseline, intervention=0.0, non_base=0.0))
        payloads.append(_target_record(track, "adapter_noop", 17, baseline, intervention=0.0, non_base=0.0))
        payloads.append(_target_record(track, "adapter_active_ft", 17, active, intervention=intervention, non_base=non_base))
    payloads.append(_anchor_record("trunk_only", 17, 96))
    payloads.append(_anchor_record("adapter_noop", 17, 96))
    payloads.append(_anchor_record("adapter_active", 17, 95))
    summary = summarize_public_benchmark_package(payloads, bootstrap_samples=200, bootstrap_seed=0)
    assert summary["headline_pass"]
    assert summary["sign_of_life_pass"]
    assert summary["anchor_pass"]
    assert summary["sign_of_life_track_count"] == 3
    assert summary["tracks"]["bag_track"]["delta_active_vs_trunk"] > 0.0
    assert summary["tracks"]["anchor_track"]["anchor_within_tolerance"]
def test_public_benchmark_package_detects_training_mismatch():
    """Tampering with one record's train spec must raise a fairness error."""
    # Per track: (baseline successes, active successes, intervention, non_base).
    track_stats = {
        "bag_track": (35, 75, 0.30, 0.40),
        "occlusion_track": (30, 68, 0.24, 0.22),
        "cloth_track": (28, 60, 0.18, 0.20),
    }
    payloads = []
    for track, (baseline, active, intervention, non_base) in track_stats.items():
        payloads.append(_target_record(track, "trunk_only_ft", 17, baseline, intervention=0.0, non_base=0.0))
        payloads.append(_target_record(track, "adapter_noop", 17, baseline, intervention=0.0, non_base=0.0))
        payloads.append(_target_record(track, "adapter_active_ft", 17, active, intervention=intervention, non_base=non_base))
    payloads.append(_anchor_record("trunk_only", 17, 96))
    payloads.append(_anchor_record("adapter_noop", 17, 96))
    payloads.append(_anchor_record("adapter_active", 17, 95))
    # Corrupt the cloth-track active run (flat index 8) with a divergent batch size.
    tampered = payloads[8]
    tampered["train_spec"]["batch_size"] = 64
    with pytest.raises(ValueError, match="Training fairness mismatch"):
        summarize_public_benchmark_package(payloads, bootstrap_samples=50, bootstrap_seed=0)