| """Tests for JDG 01–06 scoring functions.""" | |
| from __future__ import annotations | |
| from replicalab.agents.lab_manager_policy import check_feasibility | |
| from replicalab.models import Protocol, RewardBreakdown | |
| from replicalab.scenarios import generate_scenario | |
| from replicalab.scenarios.templates import AllowedSubstitution, HiddenReferenceSpec | |
| from replicalab.scoring import ( | |
| build_reward_breakdown, | |
| compute_total_reward, | |
| explain_reward, | |
| score_feasibility, | |
| score_fidelity, | |
| score_rigor, | |
| ) | |
# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------
def _scenario(template: str = "ml_benchmark", difficulty: str = "easy"):
    """Generate a deterministic scenario with a fixed seed."""
    return generate_scenario(seed=42, template=template, difficulty=difficulty)
def _good_protocol(scenario) -> Protocol:
    """Build a well-formed protocol that tracks the scenario's hidden spec."""
    lab = scenario.lab_manager_observation
    spec = scenario.hidden_reference_spec
    # Prefer the spec summary as the technique name; fall back to a generic label.
    technique = spec.summary[:60] if spec.summary else "replication_plan"
    equipment = list(lab.equipment_available[:1]) if lab.equipment_available else []
    reagents = list(lab.reagents_in_stock[:1]) if lab.reagents_in_stock else []
    rationale = (
        f"Plan addresses: {', '.join(spec.required_elements[:2])}. "
        f"Target metric: {spec.target_metric}. "
        f"Target value: {spec.target_value}. "
        "Stay within budget and schedule."
    )
    return Protocol(
        sample_size=10,
        controls=["baseline", "ablation"],
        technique=technique,
        duration_days=max(1, min(2, lab.time_limit_days)),
        required_equipment=equipment,
        required_reagents=reagents,
        rationale=rationale,
    )
def _bad_protocol() -> Protocol:
    """Build a minimal protocol that satisfies almost no requirement."""
    return Protocol(
        sample_size=1,
        duration_days=1,
        controls=[],
        technique="unknown_method",
        required_equipment=[],
        required_reagents=[],
        rationale="No plan.",
    )
def _awful_protocol(scenario) -> Protocol:
    """Build a structurally weak protocol that is clearly infeasible."""
    # Deliberately overshoot the scenario's time limit.
    over_schedule = scenario.lab_manager_observation.time_limit_days + 5
    return Protocol(
        sample_size=200,
        controls=[],
        technique="imaginary_method",
        duration_days=over_schedule,
        required_equipment=["Imaginary Device"],
        required_reagents=["Imaginary Reagent"],
        rationale="No.",
    )
# ---------------------------------------------------------------------------
# JDG 01 — score_rigor
# ---------------------------------------------------------------------------
def test_rigor_good_protocol_scores_higher_than_bad() -> None:
    """A well-aligned protocol out-scores a minimal one on rigor."""
    scenario = _scenario("ml_benchmark", "easy")
    score_good = score_rigor(_good_protocol(scenario), scenario)
    score_bad = score_rigor(_bad_protocol(), scenario)
    assert 0.0 <= score_good <= 1.0
    assert 0.0 <= score_bad <= 1.0
    assert score_good > score_bad
def test_rigor_is_deterministic() -> None:
    """Scoring the same protocol twice yields identical rigor values."""
    scenario = _scenario("ml_benchmark", "medium")
    protocol = _good_protocol(scenario)
    assert score_rigor(protocol, scenario) == score_rigor(protocol, scenario)
def test_rigor_empty_controls_reduces_score() -> None:
    """Removing all controls must not increase the rigor score.

    Fix: the test name promises *empty* controls, but the original compared
    against a single-control protocol (`["only_one"]`). Use an empty list so
    the test actually exercises the no-controls path.
    """
    scenario = _scenario("math_reasoning", "easy")
    with_controls = _good_protocol(scenario)
    without_controls = with_controls.model_copy(update={"controls": []})
    score_with = score_rigor(with_controls, scenario)
    score_without = score_rigor(without_controls, scenario)
    assert score_with >= score_without
def test_rigor_short_rationale_reduces_score() -> None:
    """A terse rationale scores strictly lower than a detailed one."""
    scenario = _scenario("finance_trading", "easy")
    detailed = _good_protocol(scenario)
    terse = detailed.model_copy(update={"rationale": "OK."})
    assert score_rigor(detailed, scenario) > score_rigor(terse, scenario)
def test_rigor_all_domains_return_valid_range() -> None:
    """score_rigor stays within [0, 1] for every template/difficulty pair."""
    templates = ("ml_benchmark", "math_reasoning", "finance_trading")
    for template in templates:
        for difficulty in ("easy", "medium", "hard"):
            scenario = generate_scenario(
                seed=99, template=template, difficulty=difficulty
            )
            score = score_rigor(_good_protocol(scenario), scenario)
            assert 0.0 <= score <= 1.0, f"{template}/{difficulty}: {score}"
# ---------------------------------------------------------------------------
# JDG 02 — score_feasibility
# ---------------------------------------------------------------------------
def test_feasibility_viable_protocol_scores_high() -> None:
    """A resource-respecting protocol earns a high feasibility score."""
    scenario = _scenario("ml_benchmark", "easy")
    score = score_feasibility(_good_protocol(scenario), scenario)
    assert 0.0 <= score <= 1.0
    assert score > 0.7
def test_feasibility_infeasible_protocol_scores_lower() -> None:
    """Blowing the budget and schedule lowers the feasibility score."""
    scenario = _scenario("ml_benchmark", "easy")
    viable = _good_protocol(scenario)
    # Oversubscribe samples, schedule, and equipment all at once.
    overreach = {
        "sample_size": 200,
        "duration_days": scenario.lab_manager_observation.time_limit_days + 5,
        "required_equipment": ["Imaginary Device"],
    }
    broken = viable.model_copy(update=overreach)
    assert score_feasibility(viable, scenario) > score_feasibility(broken, scenario)
def test_feasibility_accepts_precomputed_check() -> None:
    """Passing a precomputed feasibility check leaves the score unchanged."""
    scenario = _scenario("finance_trading", "easy")
    protocol = _good_protocol(scenario)
    precomputed = check_feasibility(protocol, scenario)
    fresh_score = score_feasibility(protocol, scenario)
    assert score_feasibility(protocol, scenario, check=precomputed) == fresh_score
def test_feasibility_is_deterministic() -> None:
    """Repeated scoring of one protocol produces identical values."""
    scenario = _scenario("math_reasoning", "medium")
    protocol = _good_protocol(scenario)
    assert score_feasibility(protocol, scenario) == score_feasibility(
        protocol, scenario
    )
def test_feasibility_partial_credit_for_near_budget() -> None:
    """A protocol slightly over budget should score higher than one far over."""
    scenario = _scenario("ml_benchmark", "easy")
    base = _good_protocol(scenario)
    slightly_over = base.model_copy(update={"sample_size": 40})
    far_over = base.model_copy(update={"sample_size": 200})
    assert score_feasibility(slightly_over, scenario) >= score_feasibility(
        far_over, scenario
    )
def test_feasibility_all_domains_return_valid_range() -> None:
    """score_feasibility stays within [0, 1] across all template/difficulty pairs."""
    templates = ("ml_benchmark", "math_reasoning", "finance_trading")
    for template in templates:
        for difficulty in ("easy", "medium", "hard"):
            scenario = generate_scenario(
                seed=99, template=template, difficulty=difficulty
            )
            score = score_feasibility(_good_protocol(scenario), scenario)
            assert 0.0 <= score <= 1.0, f"{template}/{difficulty}: {score}"
# ---------------------------------------------------------------------------
# JDG 03 — score_fidelity
# ---------------------------------------------------------------------------
def test_fidelity_aligned_protocol_scores_higher() -> None:
    """A spec-aligned protocol out-scores a misaligned one on fidelity."""
    scenario = _scenario("ml_benchmark", "easy")
    score_aligned = score_fidelity(_good_protocol(scenario), scenario)
    score_misaligned = score_fidelity(_bad_protocol(), scenario)
    assert 0.0 <= score_aligned <= 1.0
    assert 0.0 <= score_misaligned <= 1.0
    assert score_aligned > score_misaligned
def test_fidelity_is_deterministic() -> None:
    """Scoring the same protocol twice yields identical fidelity values."""
    scenario = _scenario("finance_trading", "hard")
    protocol = _good_protocol(scenario)
    assert score_fidelity(protocol, scenario) == score_fidelity(protocol, scenario)
def test_fidelity_substitution_gets_partial_credit() -> None:
    """Using an allowed substitution should score better than a total miss.

    Fixes: the original silently `return`ed when the scenario lacked
    substitutions, making the test pass vacuously — use ``pytest.skip`` so the
    report shows it was skipped. It also built a full original→alternative map
    only to read its first entry; take the first substitution directly.
    """
    scenario = _scenario("math_reasoning", "easy")
    spec = scenario.hidden_reference_spec
    substitutions = scenario.allowed_substitutions
    if not substitutions or not spec.required_elements:
        pytest.skip("scenario defines no substitutions or required elements")
    # Build a protocol whose rationale names the substitution alternative.
    alternative = substitutions[0].alternative
    with_sub = _good_protocol(scenario).model_copy(update={
        "rationale": f"We will use {alternative} instead. " + spec.target_metric,
    })
    total_miss = _bad_protocol()
    assert score_fidelity(with_sub, scenario) > score_fidelity(total_miss, scenario)
def test_fidelity_mentioning_target_metric_improves_score() -> None:
    """Dropping the target metric from the rationale must not raise fidelity.

    Fix: removed an unused ``spec`` local that the original assigned from
    ``scenario.hidden_reference_spec`` and never read.
    """
    scenario = _scenario("ml_benchmark", "easy")
    with_metric = _good_protocol(scenario)
    without_metric = with_metric.model_copy(update={
        "rationale": "Generic plan without any specific metric mentioned.",
    })
    score_with = score_fidelity(with_metric, scenario)
    score_without = score_fidelity(without_metric, scenario)
    assert score_with >= score_without
def test_fidelity_all_domains_return_valid_range() -> None:
    """score_fidelity stays within [0, 1] across all template/difficulty pairs."""
    templates = ("ml_benchmark", "math_reasoning", "finance_trading")
    for template in templates:
        for difficulty in ("easy", "medium", "hard"):
            scenario = generate_scenario(
                seed=99, template=template, difficulty=difficulty
            )
            score = score_fidelity(_good_protocol(scenario), scenario)
            assert 0.0 <= score <= 1.0, f"{template}/{difficulty}: {score}"
# ---------------------------------------------------------------------------
# Cross-scorer consistency
# ---------------------------------------------------------------------------
def test_all_scores_between_zero_and_one_for_bad_protocol() -> None:
    """All three judges stay within [0, 1] even for a minimal protocol.

    Fix: ``_bad_protocol()`` takes no arguments and does not depend on the
    scenario, so build it once instead of once per loop iteration.
    """
    bad = _bad_protocol()
    for template in ("ml_benchmark", "math_reasoning", "finance_trading"):
        scenario = generate_scenario(seed=7, template=template, difficulty="hard")
        r = score_rigor(bad, scenario)
        fe = score_feasibility(bad, scenario)
        fi = score_fidelity(bad, scenario)
        assert 0.0 <= r <= 1.0, f"rigor {template}: {r}"
        assert 0.0 <= fe <= 1.0, f"feasibility {template}: {fe}"
        assert 0.0 <= fi <= 1.0, f"fidelity {template}: {fi}"
def test_good_protocol_dominates_bad_on_rigor_and_fidelity() -> None:
    """Good protocol beats bad on rigor and fidelity.

    Feasibility is excluded: a protocol that asks for nothing is trivially
    feasible (no equipment, no reagents → nothing can fail). The other two
    scores correctly penalize an empty plan.
    """
    scenario = _scenario("ml_benchmark", "easy")
    good = _good_protocol(scenario)
    bad = _bad_protocol()
    for judge in (score_rigor, score_fidelity):
        assert judge(good, scenario) > judge(bad, scenario)
def test_good_protocol_beats_awful_protocol_on_all_scores_and_total_reward() -> None:
    """A clearly infeasible and low-quality protocol loses on every judge axis."""
    scenario = _scenario("ml_benchmark", "easy")
    good = _good_protocol(scenario)
    awful = _awful_protocol(scenario)
    for judge in (score_rigor, score_feasibility, score_fidelity):
        assert judge(good, scenario) > judge(awful, scenario)
    bd_good = build_reward_breakdown(good, scenario, rounds_used=2, max_rounds=6)
    bd_awful = build_reward_breakdown(awful, scenario, rounds_used=2, max_rounds=6)
    assert compute_total_reward(bd_good) > compute_total_reward(bd_awful)
def test_rigor_explicit_success_criteria_mentions_improve_score() -> None:
    """Mentioning scenario success criteria should improve rigor coverage."""
    spec = HiddenReferenceSpec(
        summary="risk-aware replication plan",
        required_elements=[],
        flexible_elements=[],
        target_metric="sharpe ratio",
        target_value="> 1.5",
    )
    scenario = _scenario("finance_trading", "easy").model_copy(
        update={
            "success_criteria": ["risk-adjusted return", "drawdown control"],
            "hidden_reference_spec": spec,
        }
    )
    vague = _good_protocol(scenario).model_copy(
        update={"rationale": "Follow a generic plan with basic checks."}
    )
    focused = vague.model_copy(
        update={
            "rationale": (
                "Optimize for risk-adjusted return while preserving drawdown control "
                "through explicit checkpoints."
            )
        }
    )
    assert score_rigor(focused, scenario) > score_rigor(vague, scenario)
def test_feasibility_partial_equipment_credit_sits_between_full_and_total_miss() -> None:
    """One available requirement should score between full availability and a total miss."""
    scenario = _scenario("ml_benchmark", "easy")
    available = list(scenario.lab_manager_observation.equipment_available)
    assert available, "scenario must expose at least one available equipment item"
    first_item = available[0]
    base = _good_protocol(scenario)
    # Vary only the equipment list: fully stocked, half stocked, unavailable.
    variants = {
        "full": [first_item],
        "partial": [first_item, "Imaginary Device"],
        "miss": ["Imaginary Device", "Missing Device"],
    }
    scores = {
        label: score_feasibility(
            base.model_copy(update={"required_equipment": equipment}), scenario
        )
        for label, equipment in variants.items()
    }
    assert scores["full"] > scores["partial"] > scores["miss"]
def test_fidelity_direct_match_beats_substitution_and_miss() -> None:
    """Required-element scoring should prefer direct match > allowed substitution > miss."""
    spec = HiddenReferenceSpec(
        summary="structured proof plan",
        required_elements=["alphaprobe"],
        flexible_elements=[],
        target_metric="accuracy",
        target_value="0.95",
    )
    substitution_rule = AllowedSubstitution(
        original="alphaprobe",
        alternative="betaprobe",
        condition="when the primary resource is booked",
        tradeoff="backup sensor is slower",
    )
    scenario = _scenario("math_reasoning", "easy").model_copy(
        update={
            "hidden_reference_spec": spec,
            "allowed_substitutions": [substitution_rule],
        }
    )
    # "miss" never mentions alphaprobe or betaprobe in its rationale.
    miss = Protocol(
        sample_size=10,
        controls=["baseline", "ablation"],
        technique="structured proof plan",
        duration_days=1,
        required_equipment=[],
        required_reagents=[],
        rationale="Target accuracy 0.95 with explicit evaluation.",
    )
    direct = miss.model_copy(
        update={"rationale": miss.rationale + " Use the alphaprobe."}
    )
    substitution = miss.model_copy(
        update={"rationale": miss.rationale + " Use the betaprobe."}
    )
    assert (
        score_fidelity(direct, scenario)
        > score_fidelity(substitution, scenario)
        > score_fidelity(miss, scenario)
    )
# ---------------------------------------------------------------------------
# JDG 04 — compute_total_reward
# ---------------------------------------------------------------------------
def test_total_reward_perfect_beats_broken() -> None:
    """A well-aligned protocol earns a higher total reward than a bad one."""
    scenario = _scenario("ml_benchmark", "easy")
    bd_good, bd_bad = (
        build_reward_breakdown(p, scenario, rounds_used=1, max_rounds=6)
        for p in (_good_protocol(scenario), _bad_protocol())
    )
    assert compute_total_reward(bd_good) > compute_total_reward(bd_bad)
def test_zero_feasibility_zeroes_base() -> None:
    """If any component is 0, the multiplicative base is 0."""
    breakdown = RewardBreakdown(rigor=1.0, feasibility=0.0, fidelity=1.0)
    assert compute_total_reward(breakdown) == 0.0
def test_efficiency_bonus_higher_when_faster() -> None:
    """Finishing in fewer rounds yields a higher total reward."""
    scenario = _scenario()
    protocol = _good_protocol(scenario)
    reward_fast, reward_slow = (
        compute_total_reward(
            build_reward_breakdown(protocol, scenario, rounds_used=n, max_rounds=6)
        )
        for n in (1, 5)
    )
    assert reward_fast > reward_slow
def test_penalty_subtraction_exact() -> None:
    """Named penalties subtract exactly from the total."""
    breakdown = RewardBreakdown(
        rigor=1.0,
        feasibility=1.0,
        fidelity=1.0,
        penalties={"invalid_tool_use": 2.0, "unsupported_claim": 0.5},
    )
    # Base is 10 * 1 * 1 * 1; the two penalties subtract 2.5 combined.
    assert compute_total_reward(breakdown) == 7.5
def test_total_reward_clamps_at_zero() -> None:
    """Massive penalties cannot push the total below 0."""
    breakdown = RewardBreakdown(
        rigor=0.1,
        feasibility=0.1,
        fidelity=0.1,
        penalties={"massive_penalty": 50.0},
    )
    assert compute_total_reward(breakdown) == 0.0
def test_breakdown_determinism() -> None:
    """Same inputs always produce the same total reward."""
    scenario = _scenario("finance_trading", "medium")
    protocol = _good_protocol(scenario)
    first, second = (
        compute_total_reward(
            build_reward_breakdown(protocol, scenario, rounds_used=3, max_rounds=6)
        )
        for _ in range(2)
    )
    assert first == second
# ---------------------------------------------------------------------------
# JDG 05 — build_reward_breakdown
# ---------------------------------------------------------------------------
def test_breakdown_accepts_external_penalties() -> None:
    """Callers can inject named penalty keys via the penalties parameter."""
    scenario = _scenario()
    breakdown = build_reward_breakdown(
        _good_protocol(scenario),
        scenario,
        rounds_used=2,
        max_rounds=6,
        penalties={"invalid_tool_use": 1.0},
    )
    assert "invalid_tool_use" in breakdown.penalties
    assert breakdown.penalties["invalid_tool_use"] == 1.0
def test_breakdown_no_penalties_by_default() -> None:
    """Without external penalties, the dict is empty."""
    scenario = _scenario()
    breakdown = build_reward_breakdown(
        _good_protocol(scenario), scenario, rounds_used=2, max_rounds=6
    )
    assert breakdown.penalties == {}
def test_breakdown_matches_with_and_without_precomputed_feasibility_check() -> None:
    """Providing a precomputed feasibility check should not change the breakdown."""
    scenario = _scenario("ml_benchmark", "medium")
    protocol = _good_protocol(scenario)
    shared_kwargs = {"rounds_used": 3, "max_rounds": 6}
    precomputed = check_feasibility(protocol, scenario)
    with_check = build_reward_breakdown(
        protocol, scenario, check=precomputed, **shared_kwargs
    )
    without_check = build_reward_breakdown(protocol, scenario, **shared_kwargs)
    assert with_check == without_check
# ---------------------------------------------------------------------------
# JDG 06 — explain_reward
# ---------------------------------------------------------------------------
def test_explain_mentions_all_rubric_components() -> None:
    """Explanation must reference rigor, feasibility, and fidelity."""
    text = explain_reward(RewardBreakdown(rigor=0.8, feasibility=0.6, fidelity=0.9))
    for expected in ("Rigor:", "Feasibility:", "Fidelity:", "0.80", "0.60", "0.90"):
        assert expected in text
def test_explain_includes_penalties() -> None:
    """Each named penalty key appears in the explanation."""
    breakdown = RewardBreakdown(
        rigor=0.5,
        feasibility=0.5,
        fidelity=0.5,
        penalties={"invalid_tool_use": 1.0, "unsupported_claim": 0.5},
    )
    text = explain_reward(breakdown)
    for expected in ("invalid tool use", "unsupported claim", "-1.00", "-0.50"):
        assert expected in text
def test_explain_no_penalties_message() -> None:
    """When no penalties exist, the explanation says so."""
    clean = RewardBreakdown(rigor=1.0, feasibility=1.0, fidelity=1.0)
    assert "No penalties applied" in explain_reward(clean)
def test_explain_includes_efficiency_bonus() -> None:
    """Efficiency bonus appears when present."""
    breakdown = RewardBreakdown(
        rigor=0.7, feasibility=0.7, fidelity=0.7, efficiency_bonus=0.8,
    )
    text = explain_reward(breakdown)
    assert "Efficiency bonus" in text
    assert "+0.80" in text
def test_explain_omits_efficiency_bonus_when_zero() -> None:
    """Efficiency bonus line is absent when bonus is 0."""
    breakdown = RewardBreakdown(rigor=0.7, feasibility=0.7, fidelity=0.7)
    assert "Efficiency bonus" not in explain_reward(breakdown)
def test_explain_shows_total_reward() -> None:
    """Explanation ends with the computed total reward."""
    perfect = RewardBreakdown(rigor=1.0, feasibility=1.0, fidelity=1.0)
    assert "Total reward: 10.00" in explain_reward(perfect)
def test_explain_tier_labels() -> None:
    """Quality tier labels map correctly to score ranges."""
    mixed = RewardBreakdown(rigor=0.85, feasibility=0.5, fidelity=0.25)
    text = explain_reward(mixed)
    # rigor 0.85 → strong, feasibility 0.5 → moderate, fidelity 0.25 → weak
    for tier in ("strong", "moderate", "weak"):
        assert tier in text
def test_explain_deterministic() -> None:
    """Same breakdown always produces the same explanation."""
    breakdown = RewardBreakdown(
        rigor=0.6, feasibility=0.4, fidelity=0.8,
        efficiency_bonus=0.5, penalties={"timeout": 0.3},
    )
    first, second = explain_reward(breakdown), explain_reward(breakdown)
    assert first == second
def test_explain_with_real_breakdown() -> None:
    """Explanation works end-to-end with build_reward_breakdown output."""
    scenario = _scenario("ml_benchmark", "easy")
    breakdown = build_reward_breakdown(
        _good_protocol(scenario), scenario, rounds_used=2, max_rounds=6
    )
    text = explain_reward(breakdown)
    for section in ("Rigor:", "Feasibility:", "Fidelity:", "Total reward:"):
        assert section in text