# Tests for the public benchmark package exposed by eval.public_benchmark_package.

from eval.public_benchmark_package import (
    ANCHOR_ROLE,
    TARGET_ROLE,
    build_public_eval_protocol,
    build_target_training_spec,
    default_public_benchmark_manifest,
    expected_eval_modes,
    public_benchmark_tracks,
    public_protocol_identity_signature,
    training_fairness_signature,
)

# Single seed used for every protocol and training spec built in this module,
# so that differences between compared objects cannot come from the seed.
_SEED = 17


def _protocol_identity_signatures(track_id):
    """Return the set of protocol identity signatures for *track_id*.

    One eval protocol is built per mode returned by
    ``expected_eval_modes(track_id)`` (all with the same seed), and each is
    reduced to its ``public_protocol_identity_signature``. Collecting into a
    set collapses equal signatures, so the set size is the number of distinct
    signatures across modes.
    """
    return {
        public_protocol_identity_signature(
            build_public_eval_protocol(track_id=track_id, eval_mode=mode, seed=_SEED)
        )
        for mode in expected_eval_modes(track_id)
    }


def test_public_benchmark_package_contains_expected_tracks():
    """The default manifest lists the expected track ids and anchor tolerance."""
    manifest = default_public_benchmark_manifest()

    assert manifest["target_track_ids"] == ["bag_track", "occlusion_track", "cloth_track"]
    assert manifest["anchor_track_ids"] == ["anchor_track"]
    assert manifest["thresholds"]["anchor_tolerance"] == 0.02


def test_public_target_protocol_identity_is_mode_invariant():
    """All eval modes of ``bag_track`` share a single protocol identity signature."""
    protocol_signatures = _protocol_identity_signatures("bag_track")

    # Exactly one distinct signature; an empty mode list also fails here.
    assert len(protocol_signatures) == 1


def test_public_anchor_protocol_identity_is_mode_invariant():
    """All eval modes of ``anchor_track`` share a single protocol identity signature."""
    protocol_signatures = _protocol_identity_signatures("anchor_track")

    # Exactly one distinct signature; an empty mode list also fails here.
    assert len(protocol_signatures) == 1


def test_training_fairness_signature_matches_for_trunk_and_adapter():
    """``trunk_only_ft`` and ``adapter_active_ft`` specs have equal fairness signatures.

    Both specs are built for ``cloth_track`` with the same seed; only the
    model variant differs.
    """
    trunk = build_target_training_spec(
        track_id="cloth_track", model_variant="trunk_only_ft", seed=_SEED
    )
    active = build_target_training_spec(
        track_id="cloth_track", model_variant="adapter_active_ft", seed=_SEED
    )

    assert training_fairness_signature(trunk) == training_fairness_signature(active)


def test_public_track_roles_are_partitioned():
    """Filtering tracks by role yields exactly the expected ids, each tagged with that role."""
    target_roles = {
        track.track_id: track.role for track in public_benchmark_tracks(TARGET_ROLE)
    }
    anchor_roles = {
        track.track_id: track.role for track in public_benchmark_tracks(ANCHOR_ROLE)
    }

    assert target_roles == {
        "bag_track": TARGET_ROLE,
        "occlusion_track": TARGET_ROLE,
        "cloth_track": TARGET_ROLE,
    }
    assert anchor_roles == {"anchor_track": ANCHOR_ROLE}