# HONEST-RL-Calibrator / tests/test_difficulty.py
# Rushhaabhhh's picture
# HONEST-RL-Calibrator-v0
# 3040767 verified
"""Tests for server/difficulty.py."""
import pytest
from models.models import HonestState
from server.difficulty import (
HYSTERESIS_EPISODES,
MAX_DIFFICULTY,
MIN_DIFFICULTY,
WINDOW,
get_rolling_accuracy,
update_difficulty,
)
# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------
def make_state(domain: str = "math", difficulty: int = 3) -> HonestState:
    """Build a fresh ``HonestState`` that tracks a single domain.

    The returned state has ``current_domain`` set to *domain* and a
    ``domain_difficulties`` table holding only ``{domain: difficulty}``.
    """
    difficulty_table = {domain: difficulty}
    return HonestState(current_domain=domain, domain_difficulties=difficulty_table)
def run_episodes(state: HonestState, outcomes: list, domain: str | None = None):
    """Replay a sequence of True/False/None outcomes through update_difficulty.

    For every outcome a history record is appended to
    ``state.episode_history`` *before* ``update_difficulty`` is called, so the
    rolling accuracy it computes already includes the current step (this is
    meant to mirror what HonestEnvironment.step() does, with
    update_difficulty acting as a pure reader of the history).

    When *domain* is omitted (or falsy) the state's ``current_domain`` is used.
    """
    target = domain or state.current_domain
    # Difficulty stamped on each record; defaults to 1 for an unknown domain.
    level = state.domain_difficulties.get(target, 1)

    for result in outcomes:
        entry: dict = dict(
            domain=target,
            correct=result,
            difficulty=level,
            difficulty_changed=False,
        )
        state.episode_history.append(entry)

        updated_level, did_change = update_difficulty(state, result, domain=target)
        if not did_change:
            continue

        # Flag the record already stored in the history and carry the new
        # level forward to subsequent records.
        entry["difficulty_changed"] = True
        level = updated_level
# ---------------------------------------------------------------------------
# get_rolling_accuracy
# ---------------------------------------------------------------------------
class TestGetRollingAccuracy:
    """Checks for ``get_rolling_accuracy`` on the "math" domain."""

    def test_returns_neutral_for_empty_history(self):
        """With no recorded episodes the accuracy is the neutral 0.5."""
        state = make_state()
        assert get_rolling_accuracy(state, "math") == pytest.approx(0.5)

    def test_all_correct(self):
        """Five correct outcomes give an accuracy of 1.0."""
        state = make_state()
        run_episodes(state, [True] * 5)
        assert get_rolling_accuracy(state, "math") == pytest.approx(1.0)

    def test_all_wrong(self):
        """Five wrong outcomes give an accuracy of 0.0."""
        state = make_state()
        run_episodes(state, [False] * 5)
        assert get_rolling_accuracy(state, "math") == pytest.approx(0.0)

    def test_mixed(self):
        """Three correct out of five gives 0.6."""
        state = make_state()
        run_episodes(state, [True, True, False, False, True])  # 3/5 = 0.6
        assert get_rolling_accuracy(state, "math") == pytest.approx(0.6)

    def test_window_is_capped_at_20(self):
        """Outcomes older than the rolling window must not affect accuracy.

        The trailing all-correct streak is sized from the imported ``WINDOW``
        constant rather than a hard-coded 20, so the test keeps exercising
        the window boundary if the constant is ever retuned.
        """
        state = make_state()
        # 5 wrong episodes followed by exactly one full window of correct ones.
        run_episodes(state, [False] * 5 + [True] * WINDOW)
        # Only the last WINDOW episodes are visible -- all of them correct.
        assert get_rolling_accuracy(state, "math") == pytest.approx(1.0)

    def test_none_counted_as_wrong(self):
        """A ``None`` outcome counts against accuracy just like ``False``."""
        state = make_state()
        run_episodes(state, [None, None, True])  # 1/3
        assert get_rolling_accuracy(state, "math") == pytest.approx(1 / 3)
# ---------------------------------------------------------------------------
# Difficulty increases
# ---------------------------------------------------------------------------
class TestDifficultyIncrease:
    """Upward difficulty adjustments on the "math" domain."""

    def test_increases_after_sustained_high_accuracy(self):
        """Fifteen straight correct answers lift difficulty from 2 to 3."""
        st = make_state(difficulty=2)
        # 15/15 correct -> accuracy 1.0; the run is long enough to get past
        # the hysteresis gap counted from the initial episode.
        winning_streak = [True] * 15
        run_episodes(st, winning_streak)
        assert st.domain_difficulties["math"] == 3

    def test_does_not_increase_before_hysteresis_clears(self):
        """Nine correct answers are too few for a change to be allowed."""
        st = make_state(difficulty=2)
        # High accuracy, but still inside the hysteresis gap.
        short_streak = [True] * 9
        run_episodes(st, short_streak)
        assert st.domain_difficulties["math"] == 2

    def test_caps_at_max_difficulty(self):
        """A state already at MAX_DIFFICULTY never goes higher."""
        st = make_state(difficulty=MAX_DIFFICULTY)
        winning_streak = [True] * 15
        run_episodes(st, winning_streak)
        assert st.domain_difficulties["math"] == MAX_DIFFICULTY
# ---------------------------------------------------------------------------
# Difficulty decreases
# ---------------------------------------------------------------------------
class TestDifficultyDecrease:
    """Downward difficulty adjustments on the "math" domain."""

    def test_decreases_after_sustained_low_accuracy(self):
        """Fifteen straight wrong answers drop difficulty from 3 to 2."""
        st = make_state(difficulty=3)
        # 0/15 correct -> accuracy 0.0.
        losing_streak = [False] * 15
        run_episodes(st, losing_streak)
        assert st.domain_difficulties["math"] == 2

    def test_does_not_decrease_before_hysteresis_clears(self):
        """Nine wrong answers are too few for a change to be allowed."""
        st = make_state(difficulty=3)
        short_streak = [False] * 9
        run_episodes(st, short_streak)
        assert st.domain_difficulties["math"] == 3

    def test_floors_at_min_difficulty(self):
        """A state already at MIN_DIFFICULTY never goes lower."""
        st = make_state(difficulty=MIN_DIFFICULTY)
        losing_streak = [False] * 15
        run_episodes(st, losing_streak)
        assert st.domain_difficulties["math"] == MIN_DIFFICULTY
# ---------------------------------------------------------------------------
# Hysteresis — no rapid oscillation
# ---------------------------------------------------------------------------
class TestHysteresis:
    """Hysteresis must prevent back-to-back difficulty changes."""

    def test_no_second_change_before_hysteresis_window(self):
        """After one increase, a follow-up inside the window is suppressed."""
        st = make_state(difficulty=2)

        # Eleven correct answers trigger the first increase (around episode 10).
        run_episodes(st, [True] * 11)
        assert st.domain_difficulties["math"] == 3

        # Keep answering correctly, but stop one episode short of a full
        # hysteresis window: no further change may fire yet.
        follow_up = [True] * (HYSTERESIS_EPISODES - 1)
        run_episodes(st, follow_up)
        assert st.domain_difficulties["math"] == 3  # still 3

    def test_second_change_fires_after_hysteresis_window(self):
        """Once a full window has elapsed, a second increase is allowed."""
        st = make_state(difficulty=2)

        # First change.
        run_episodes(st, [True] * 11)
        assert st.domain_difficulties["math"] == 3

        # One episode more than the hysteresis window, all correct, so the
        # accuracy stays high and enough episodes separate the two changes.
        follow_up = [True] * (HYSTERESIS_EPISODES + 1)
        run_episodes(st, follow_up)
        assert st.domain_difficulties["math"] == 4

    def test_oscillation_blocked(self):
        """Rapid correct→wrong cannot ping-pong the difficulty."""
        st = make_state(difficulty=3)

        # Eleven correct answers raise the difficulty to 4.
        run_episodes(st, [True] * 11)
        assert st.domain_difficulties["math"] == 4

        # Nine wrong answers right afterwards are not enough to undo it.
        run_episodes(st, [False] * 9)
        assert st.domain_difficulties["math"] == 4  # no decrease yet
# ---------------------------------------------------------------------------
# Per-domain independence
# ---------------------------------------------------------------------------
class TestPerDomainIndependence:
    """Each domain's difficulty is adjusted without touching the others."""

    def test_math_accuracy_does_not_affect_code(self):
        """Driving "math" accuracy up leaves the "code" difficulty alone."""
        starting = {"math": 2, "code": 3}
        st = HonestState(current_domain="math", domain_difficulties=starting)

        # Push math accuracy very high.
        run_episodes(st, [True] * 15, domain="math")

        levels = st.domain_difficulties
        assert levels["math"] > 2  # math went up
        assert levels["code"] == 3  # code untouched

    def test_separate_hysteresis_per_domain(self):
        """An increase in "math" does not move "code" from its start value."""
        starting = {"math": 2, "code": 2}
        st = HonestState(current_domain="math", domain_difficulties=starting)

        # Only math receives episodes.
        run_episodes(st, [True] * 11, domain="math")

        levels = st.domain_difficulties
        assert levels["math"] == 3
        assert levels["code"] == 2  # code still at 2

    def test_domains_track_independently(self):
        """Two domains can move in opposite directions within one state."""
        starting = {"math": 3, "logic": 3}
        st = HonestState(current_domain="math", domain_difficulties=starting)

        # Math is driven down, then logic is driven up.
        run_episodes(st, [False] * 15, domain="math")
        run_episodes(st, [True] * 15, domain="logic")

        levels = st.domain_difficulties
        assert levels["math"] < 3
        assert levels["logic"] > 3
# ---------------------------------------------------------------------------
# Bounds enforcement
# ---------------------------------------------------------------------------
class TestBoundsEnforcement:
    """Difficulty must stay within [MIN_DIFFICULTY, MAX_DIFFICULTY]."""

    @pytest.mark.parametrize("start", [1, 2, 3, 4, 5])
    def test_difficulty_always_in_range(self, start):
        """Long streaks in both directions never leave the allowed range."""
        st = make_state(difficulty=start)

        # Thirty correct answers followed by thirty wrong ones.
        climb = [True] * 30
        fall = [False] * 30
        run_episodes(st, climb + fall)

        final_level = st.domain_difficulties["math"]
        assert MIN_DIFFICULTY <= final_level <= MAX_DIFFICULTY