"""Tests for JDG 01–06 scoring functions.""" from __future__ import annotations from replicalab.agents.lab_manager_policy import check_feasibility from replicalab.models import Protocol, RewardBreakdown from replicalab.scenarios import generate_scenario from replicalab.scenarios.templates import AllowedSubstitution, HiddenReferenceSpec from replicalab.scoring import ( build_reward_breakdown, compute_total_reward, explain_reward, score_feasibility, score_fidelity, score_rigor, ) # --------------------------------------------------------------------------- # Helpers # --------------------------------------------------------------------------- def _scenario(template: str = "ml_benchmark", difficulty: str = "easy"): return generate_scenario(seed=42, template=template, difficulty=difficulty) def _good_protocol(scenario) -> Protocol: """Build a well-formed protocol aligned to the scenario.""" lab = scenario.lab_manager_observation spec = scenario.hidden_reference_spec return Protocol( sample_size=10, controls=["baseline", "ablation"], technique=spec.summary[:60] if spec.summary else "replication_plan", duration_days=max(1, min(2, lab.time_limit_days)), required_equipment=( list(lab.equipment_available[:1]) if lab.equipment_available else [] ), required_reagents=( list(lab.reagents_in_stock[:1]) if lab.reagents_in_stock else [] ), rationale=( f"Plan addresses: {', '.join(spec.required_elements[:2])}. " f"Target metric: {spec.target_metric}. " f"Target value: {spec.target_value}. " "Stay within budget and schedule." ), ) def _bad_protocol() -> Protocol: """Build a minimal protocol that misses most requirements.""" return Protocol( sample_size=1, controls=[], technique="unknown_method", duration_days=1, required_equipment=[], required_reagents=[], rationale="No plan.", ) def _awful_protocol(scenario) -> Protocol: """Build a structurally weak and clearly infeasible protocol.""" return Protocol( sample_size=200, controls=[], technique="imaginary_method", duration_days=scenario.lab_manager_observation.time_limit_days + 5, required_equipment=["Imaginary Device"], required_reagents=["Imaginary Reagent"], rationale="No.", ) # --------------------------------------------------------------------------- # JDG 01 — score_rigor # --------------------------------------------------------------------------- def test_rigor_good_protocol_scores_higher_than_bad() -> None: scenario = _scenario("ml_benchmark", "easy") good = _good_protocol(scenario) bad = _bad_protocol() good_score = score_rigor(good, scenario) bad_score = score_rigor(bad, scenario) assert good_score > bad_score assert 0.0 <= good_score <= 1.0 assert 0.0 <= bad_score <= 1.0 def test_rigor_is_deterministic() -> None: scenario = _scenario("ml_benchmark", "medium") protocol = _good_protocol(scenario) first = score_rigor(protocol, scenario) second = score_rigor(protocol, scenario) assert first == second def test_rigor_empty_controls_reduces_score() -> None: scenario = _scenario("math_reasoning", "easy") with_controls = _good_protocol(scenario) without_controls = with_controls.model_copy(update={"controls": ["only_one"]}) score_with = score_rigor(with_controls, scenario) score_without = score_rigor(without_controls, scenario) assert score_with >= score_without def test_rigor_short_rationale_reduces_score() -> None: scenario = _scenario("finance_trading", "easy") good = _good_protocol(scenario) short = good.model_copy(update={"rationale": "OK."}) assert score_rigor(good, scenario) > score_rigor(short, scenario) def test_rigor_all_domains_return_valid_range() -> None: for template in ("ml_benchmark", "math_reasoning", "finance_trading"): for difficulty in ("easy", "medium", "hard"): scenario = generate_scenario(seed=99, template=template, difficulty=difficulty) protocol = _good_protocol(scenario) score = score_rigor(protocol, scenario) assert 0.0 <= score <= 1.0, f"{template}/{difficulty}: {score}" # --------------------------------------------------------------------------- # JDG 02 — score_feasibility # --------------------------------------------------------------------------- def test_feasibility_viable_protocol_scores_high() -> None: scenario = _scenario("ml_benchmark", "easy") protocol = _good_protocol(scenario) score = score_feasibility(protocol, scenario) assert score > 0.7 assert 0.0 <= score <= 1.0 def test_feasibility_infeasible_protocol_scores_lower() -> None: scenario = _scenario("ml_benchmark", "easy") good = _good_protocol(scenario) # Blow the budget and schedule bad = good.model_copy(update={ "sample_size": 200, "duration_days": scenario.lab_manager_observation.time_limit_days + 5, "required_equipment": ["Imaginary Device"], }) good_score = score_feasibility(good, scenario) bad_score = score_feasibility(bad, scenario) assert good_score > bad_score def test_feasibility_accepts_precomputed_check() -> None: scenario = _scenario("finance_trading", "easy") protocol = _good_protocol(scenario) check = check_feasibility(protocol, scenario) score_with = score_feasibility(protocol, scenario, check=check) score_without = score_feasibility(protocol, scenario) assert score_with == score_without def test_feasibility_is_deterministic() -> None: scenario = _scenario("math_reasoning", "medium") protocol = _good_protocol(scenario) first = score_feasibility(protocol, scenario) second = score_feasibility(protocol, scenario) assert first == second def test_feasibility_partial_credit_for_near_budget() -> None: """A protocol slightly over budget should score higher than one far over.""" scenario = _scenario("ml_benchmark", "easy") good = _good_protocol(scenario) slightly_over = good.model_copy(update={"sample_size": 40}) far_over = good.model_copy(update={"sample_size": 200}) score_slight = score_feasibility(slightly_over, scenario) score_far = score_feasibility(far_over, scenario) assert score_slight >= score_far def test_feasibility_all_domains_return_valid_range() -> None: for template in ("ml_benchmark", "math_reasoning", "finance_trading"): for difficulty in ("easy", "medium", "hard"): scenario = generate_scenario(seed=99, template=template, difficulty=difficulty) protocol = _good_protocol(scenario) score = score_feasibility(protocol, scenario) assert 0.0 <= score <= 1.0, f"{template}/{difficulty}: {score}" # --------------------------------------------------------------------------- # JDG 03 — score_fidelity # --------------------------------------------------------------------------- def test_fidelity_aligned_protocol_scores_higher() -> None: scenario = _scenario("ml_benchmark", "easy") aligned = _good_protocol(scenario) misaligned = _bad_protocol() aligned_score = score_fidelity(aligned, scenario) misaligned_score = score_fidelity(misaligned, scenario) assert aligned_score > misaligned_score assert 0.0 <= aligned_score <= 1.0 assert 0.0 <= misaligned_score <= 1.0 def test_fidelity_is_deterministic() -> None: scenario = _scenario("finance_trading", "hard") protocol = _good_protocol(scenario) first = score_fidelity(protocol, scenario) second = score_fidelity(protocol, scenario) assert first == second def test_fidelity_substitution_gets_partial_credit() -> None: """Using an allowed substitution should score better than a total miss.""" scenario = _scenario("math_reasoning", "easy") spec = scenario.hidden_reference_spec # Find a required element that has a substitution sub_map = {} for sub in scenario.allowed_substitutions: sub_map[sub.original.lower()] = sub.alternative if not sub_map or not spec.required_elements: return # skip if no substitution exists in this scenario # Build protocol that uses the substitution alternative first_sub_original = list(sub_map.keys())[0] first_sub_alt = sub_map[first_sub_original] with_sub = _good_protocol(scenario).model_copy(update={ "rationale": f"We will use {first_sub_alt} instead. " + spec.target_metric, }) without_anything = _bad_protocol() score_sub = score_fidelity(with_sub, scenario) score_miss = score_fidelity(without_anything, scenario) assert score_sub > score_miss def test_fidelity_mentioning_target_metric_improves_score() -> None: scenario = _scenario("ml_benchmark", "easy") spec = scenario.hidden_reference_spec with_metric = _good_protocol(scenario) without_metric = with_metric.model_copy(update={ "rationale": "Generic plan without any specific metric mentioned.", }) score_with = score_fidelity(with_metric, scenario) score_without = score_fidelity(without_metric, scenario) assert score_with >= score_without def test_fidelity_all_domains_return_valid_range() -> None: for template in ("ml_benchmark", "math_reasoning", "finance_trading"): for difficulty in ("easy", "medium", "hard"): scenario = generate_scenario(seed=99, template=template, difficulty=difficulty) protocol = _good_protocol(scenario) score = score_fidelity(protocol, scenario) assert 0.0 <= score <= 1.0, f"{template}/{difficulty}: {score}" # --------------------------------------------------------------------------- # Cross-scorer consistency # --------------------------------------------------------------------------- def test_all_scores_between_zero_and_one_for_bad_protocol() -> None: for template in ("ml_benchmark", "math_reasoning", "finance_trading"): scenario = generate_scenario(seed=7, template=template, difficulty="hard") bad = _bad_protocol() r = score_rigor(bad, scenario) fe = score_feasibility(bad, scenario) fi = score_fidelity(bad, scenario) assert 0.0 <= r <= 1.0, f"rigor {template}: {r}" assert 0.0 <= fe <= 1.0, f"feasibility {template}: {fe}" assert 0.0 <= fi <= 1.0, f"fidelity {template}: {fi}" def test_good_protocol_dominates_bad_on_rigor_and_fidelity() -> None: """Good protocol beats bad on rigor and fidelity. Feasibility is excluded: a protocol that asks for nothing is trivially feasible (no equipment, no reagents → nothing can fail). The other two scores correctly penalize an empty plan. """ scenario = _scenario("ml_benchmark", "easy") good = _good_protocol(scenario) bad = _bad_protocol() assert score_rigor(good, scenario) > score_rigor(bad, scenario) assert score_fidelity(good, scenario) > score_fidelity(bad, scenario) def test_good_protocol_beats_awful_protocol_on_all_scores_and_total_reward() -> None: """A clearly infeasible and low-quality protocol loses on every judge axis.""" scenario = _scenario("ml_benchmark", "easy") good = _good_protocol(scenario) awful = _awful_protocol(scenario) good_breakdown = build_reward_breakdown(good, scenario, rounds_used=2, max_rounds=6) awful_breakdown = build_reward_breakdown(awful, scenario, rounds_used=2, max_rounds=6) assert score_rigor(good, scenario) > score_rigor(awful, scenario) assert score_feasibility(good, scenario) > score_feasibility(awful, scenario) assert score_fidelity(good, scenario) > score_fidelity(awful, scenario) assert compute_total_reward(good_breakdown) > compute_total_reward(awful_breakdown) def test_rigor_explicit_success_criteria_mentions_improve_score() -> None: """Mentioning scenario success criteria should improve rigor coverage.""" scenario = _scenario("finance_trading", "easy").model_copy( update={ "success_criteria": ["risk-adjusted return", "drawdown control"], "hidden_reference_spec": HiddenReferenceSpec( summary="risk-aware replication plan", required_elements=[], flexible_elements=[], target_metric="sharpe ratio", target_value="> 1.5", ), } ) generic = _good_protocol(scenario).model_copy( update={"rationale": "Follow a generic plan with basic checks."} ) explicit = generic.model_copy( update={ "rationale": ( "Optimize for risk-adjusted return while preserving drawdown control " "through explicit checkpoints." ) } ) assert score_rigor(explicit, scenario) > score_rigor(generic, scenario) def test_feasibility_partial_equipment_credit_sits_between_full_and_total_miss() -> None: """One available requirement should score between full availability and a total miss.""" scenario = _scenario("ml_benchmark", "easy") available = list(scenario.lab_manager_observation.equipment_available) assert available, "scenario must expose at least one available equipment item" full = _good_protocol(scenario).model_copy( update={"required_equipment": [available[0]]} ) partial = full.model_copy( update={"required_equipment": [available[0], "Imaginary Device"]} ) miss = full.model_copy( update={"required_equipment": ["Imaginary Device", "Missing Device"]} ) full_score = score_feasibility(full, scenario) partial_score = score_feasibility(partial, scenario) miss_score = score_feasibility(miss, scenario) assert full_score > partial_score > miss_score def test_fidelity_direct_match_beats_substitution_and_miss() -> None: """Required-element scoring should prefer direct match > allowed substitution > miss.""" scenario = _scenario("math_reasoning", "easy").model_copy( update={ "hidden_reference_spec": HiddenReferenceSpec( summary="structured proof plan", required_elements=["alphaprobe"], flexible_elements=[], target_metric="accuracy", target_value="0.95", ), "allowed_substitutions": [ AllowedSubstitution( original="alphaprobe", alternative="betaprobe", condition="when the primary resource is booked", tradeoff="backup sensor is slower", ) ], } ) base = Protocol( sample_size=10, controls=["baseline", "ablation"], technique="structured proof plan", duration_days=1, required_equipment=[], required_reagents=[], rationale="Target accuracy 0.95 with explicit evaluation.", ) direct = base.model_copy( update={"rationale": base.rationale + " Use the alphaprobe."} ) substitution = base.model_copy( update={"rationale": base.rationale + " Use the betaprobe."} ) miss = base direct_score = score_fidelity(direct, scenario) substitution_score = score_fidelity(substitution, scenario) miss_score = score_fidelity(miss, scenario) assert direct_score > substitution_score > miss_score # --------------------------------------------------------------------------- # JDG 04 — compute_total_reward # --------------------------------------------------------------------------- def test_total_reward_perfect_beats_broken() -> None: """A well-aligned protocol earns a higher total reward than a bad one.""" scenario = _scenario("ml_benchmark", "easy") good = _good_protocol(scenario) bad = _bad_protocol() good_bd = build_reward_breakdown(good, scenario, rounds_used=1, max_rounds=6) bad_bd = build_reward_breakdown(bad, scenario, rounds_used=1, max_rounds=6) assert compute_total_reward(good_bd) > compute_total_reward(bad_bd) def test_zero_feasibility_zeroes_base() -> None: """If any component is 0, the multiplicative base is 0.""" rb = RewardBreakdown(rigor=1.0, feasibility=0.0, fidelity=1.0) assert compute_total_reward(rb) == 0.0 def test_efficiency_bonus_higher_when_faster() -> None: """Finishing in fewer rounds yields a higher total reward.""" scenario = _scenario() protocol = _good_protocol(scenario) fast = build_reward_breakdown(protocol, scenario, rounds_used=1, max_rounds=6) slow = build_reward_breakdown(protocol, scenario, rounds_used=5, max_rounds=6) assert compute_total_reward(fast) > compute_total_reward(slow) def test_penalty_subtraction_exact() -> None: """Named penalties subtract exactly from the total.""" rb = RewardBreakdown( rigor=1.0, feasibility=1.0, fidelity=1.0, penalties={"invalid_tool_use": 2.0, "unsupported_claim": 0.5}, ) total = compute_total_reward(rb) assert total == 7.5 # 10*1*1*1 - 2.5 def test_total_reward_clamps_at_zero() -> None: """Massive penalties cannot push the total below 0.""" rb = RewardBreakdown( rigor=0.1, feasibility=0.1, fidelity=0.1, penalties={"massive_penalty": 50.0}, ) assert compute_total_reward(rb) == 0.0 def test_breakdown_determinism() -> None: """Same inputs always produce the same total reward.""" scenario = _scenario("finance_trading", "medium") protocol = _good_protocol(scenario) b1 = build_reward_breakdown(protocol, scenario, rounds_used=3, max_rounds=6) b2 = build_reward_breakdown(protocol, scenario, rounds_used=3, max_rounds=6) assert compute_total_reward(b1) == compute_total_reward(b2) # --------------------------------------------------------------------------- # JDG 05 — build_reward_breakdown # --------------------------------------------------------------------------- def test_breakdown_accepts_external_penalties() -> None: """Callers can inject named penalty keys via the penalties parameter.""" scenario = _scenario() protocol = _good_protocol(scenario) bd = build_reward_breakdown( protocol, scenario, rounds_used=2, max_rounds=6, penalties={"invalid_tool_use": 1.0}, ) assert "invalid_tool_use" in bd.penalties assert bd.penalties["invalid_tool_use"] == 1.0 def test_breakdown_no_penalties_by_default() -> None: """Without external penalties, the dict is empty.""" scenario = _scenario() protocol = _good_protocol(scenario) bd = build_reward_breakdown(protocol, scenario, rounds_used=2, max_rounds=6) assert bd.penalties == {} def test_breakdown_matches_with_and_without_precomputed_feasibility_check() -> None: """Providing a precomputed feasibility check should not change the breakdown.""" scenario = _scenario("ml_benchmark", "medium") protocol = _good_protocol(scenario) precomputed = check_feasibility(protocol, scenario) with_check = build_reward_breakdown( protocol, scenario, rounds_used=3, max_rounds=6, check=precomputed, ) without_check = build_reward_breakdown( protocol, scenario, rounds_used=3, max_rounds=6, ) assert with_check == without_check # --------------------------------------------------------------------------- # JDG 06 — explain_reward # --------------------------------------------------------------------------- def test_explain_mentions_all_rubric_components() -> None: """Explanation must reference rigor, feasibility, and fidelity.""" bd = RewardBreakdown(rigor=0.8, feasibility=0.6, fidelity=0.9) text = explain_reward(bd) assert "Rigor:" in text assert "Feasibility:" in text assert "Fidelity:" in text assert "0.80" in text assert "0.60" in text assert "0.90" in text def test_explain_includes_penalties() -> None: """Each named penalty key appears in the explanation.""" bd = RewardBreakdown( rigor=0.5, feasibility=0.5, fidelity=0.5, penalties={"invalid_tool_use": 1.0, "unsupported_claim": 0.5}, ) text = explain_reward(bd) assert "invalid tool use" in text assert "unsupported claim" in text assert "-1.00" in text assert "-0.50" in text def test_explain_no_penalties_message() -> None: """When no penalties exist, the explanation says so.""" bd = RewardBreakdown(rigor=1.0, feasibility=1.0, fidelity=1.0) text = explain_reward(bd) assert "No penalties applied" in text def test_explain_includes_efficiency_bonus() -> None: """Efficiency bonus appears when present.""" bd = RewardBreakdown( rigor=0.7, feasibility=0.7, fidelity=0.7, efficiency_bonus=0.8, ) text = explain_reward(bd) assert "Efficiency bonus" in text assert "+0.80" in text def test_explain_omits_efficiency_bonus_when_zero() -> None: """Efficiency bonus line is absent when bonus is 0.""" bd = RewardBreakdown(rigor=0.7, feasibility=0.7, fidelity=0.7) text = explain_reward(bd) assert "Efficiency bonus" not in text def test_explain_shows_total_reward() -> None: """Explanation ends with the computed total reward.""" bd = RewardBreakdown(rigor=1.0, feasibility=1.0, fidelity=1.0) text = explain_reward(bd) assert "Total reward: 10.00" in text def test_explain_tier_labels() -> None: """Quality tier labels map correctly to score ranges.""" strong = RewardBreakdown(rigor=0.85, feasibility=0.5, fidelity=0.25) text = explain_reward(strong) assert "strong" in text # rigor 0.85 assert "moderate" in text # feasibility 0.5 assert "weak" in text # fidelity 0.25 def test_explain_deterministic() -> None: """Same breakdown always produces the same explanation.""" bd = RewardBreakdown( rigor=0.6, feasibility=0.4, fidelity=0.8, efficiency_bonus=0.5, penalties={"timeout": 0.3}, ) assert explain_reward(bd) == explain_reward(bd) def test_explain_with_real_breakdown() -> None: """Explanation works end-to-end with build_reward_breakdown output.""" scenario = _scenario("ml_benchmark", "easy") protocol = _good_protocol(scenario) bd = build_reward_breakdown(protocol, scenario, rounds_used=2, max_rounds=6) text = explain_reward(bd) assert "Rigor:" in text assert "Feasibility:" in text assert "Fidelity:" in text assert "Total reward:" in text