Spaces:

Rushhaabhhh
/

HONEST-RL-Calibrator

Sleeping

App Files Files Community

HONEST-RL-Calibrator / tests /test_difficulty.py

Rushhaabhhh

HONEST-RL-Calibrator-v0

3040767 verified about 1 month ago

raw

history blame contribute delete

8.71 kB

	"""Tests for server/difficulty.py."""

	import pytest

	from models.models import HonestState
	from server.difficulty import (
	HYSTERESIS_EPISODES,
	MAX_DIFFICULTY,
	MIN_DIFFICULTY,
	WINDOW,
	get_rolling_accuracy,
	update_difficulty,
	)


	# ---------------------------------------------------------------------------
	# Helpers
	# ---------------------------------------------------------------------------


	def make_state(domain: str = "math", difficulty: int = 3) -> HonestState:
	    """Build a HonestState that tracks a single domain.

	    The returned state has ``current_domain`` set to *domain* and a
	    ``domain_difficulties`` mapping holding only ``{domain: difficulty}``.
	    """
	    starting_levels = {domain: difficulty}
	    return HonestState(current_domain=domain, domain_difficulties=starting_levels)


	def run_episodes(state: HonestState, outcomes: list, domain: str | None = None) -> None:
	    """Feed a sequence of True/False/None outcomes into update_difficulty.

	    Mirrors what HonestEnvironment.step() does: the rich record is appended
	    to ``state.episode_history`` *before* update_difficulty is called, so
	    update_difficulty (a pure reader of the history) computes rolling
	    accuracy with the current step included.

	    Args:
	        state: State whose ``episode_history`` is appended to and which is
	            handed to update_difficulty on every step.
	        outcomes: One entry per episode; each is True, False or None and is
	            stored verbatim under the record's "correct" key.
	        domain: Domain to attribute the episodes to. Falls back to
	            ``state.current_domain`` when omitted (or otherwise falsy).
	    """
	    active_domain = domain or state.current_domain
	    # Difficulty stamped on each record; 1 is the fallback for a domain that
	    # has no entry in domain_difficulties yet.
	    diff = state.domain_difficulties.get(active_domain, 1)
	    for outcome in outcomes:
	        record: dict = {
	            "domain": active_domain,
	            "correct": outcome,
	            "difficulty": diff,
	            "difficulty_changed": False,
	        }
	        # Append first so the record is visible to update_difficulty.
	        state.episode_history.append(record)
	        new_diff, changed = update_difficulty(state, outcome, domain=active_domain)
	        if changed:
	            # The record is already in the history; flag it in place and
	            # carry the new level forward to the following episodes.
	            record["difficulty_changed"] = True
	            diff = new_diff


	# ---------------------------------------------------------------------------
	# get_rolling_accuracy
	# ---------------------------------------------------------------------------


	class TestGetRollingAccuracy:
	    """Checks on get_rolling_accuracy for the default "math" domain."""

	    def test_returns_neutral_for_empty_history(self):
	        # With no episodes recorded the expected accuracy is 0.5.
	        fresh = make_state()
	        accuracy = get_rolling_accuracy(fresh, "math")
	        assert accuracy == pytest.approx(0.5)

	    def test_all_correct(self):
	        tracker = make_state()
	        streak = [True for _ in range(5)]
	        run_episodes(tracker, streak)
	        accuracy = get_rolling_accuracy(tracker, "math")
	        assert accuracy == pytest.approx(1.0)

	    def test_all_wrong(self):
	        tracker = make_state()
	        streak = [False for _ in range(5)]
	        run_episodes(tracker, streak)
	        accuracy = get_rolling_accuracy(tracker, "math")
	        assert accuracy == pytest.approx(0.0)

	    def test_mixed(self):
	        tracker = make_state()
	        # Three hits out of five episodes -> 0.6
	        results = [True, True, False, False, True]
	        run_episodes(tracker, results)
	        accuracy = get_rolling_accuracy(tracker, "math")
	        assert accuracy == pytest.approx(0.6)

	    def test_window_is_capped_at_20(self):
	        tracker = make_state()
	        # 25 episodes in total: 5 old misses followed by 20 recent hits.
	        # NOTE(review): 20 is hard-coded here; presumably it mirrors WINDOW
	        # from server.difficulty - confirm.
	        old_misses = [False] * 5
	        recent_hits = [True] * 20
	        run_episodes(tracker, old_misses + recent_hits)
	        # Only the 20 most recent episodes should count, and all were hits.
	        accuracy = get_rolling_accuracy(tracker, "math")
	        assert accuracy == pytest.approx(1.0)

	    def test_none_counted_as_wrong(self):
	        tracker = make_state()
	        # Two None outcomes plus one hit -> 1/3
	        results = [None, None, True]
	        run_episodes(tracker, results)
	        accuracy = get_rolling_accuracy(tracker, "math")
	        assert accuracy == pytest.approx(1 / 3)


	# ---------------------------------------------------------------------------
	# Difficulty increases
	# ---------------------------------------------------------------------------


	class TestDifficultyIncrease:
	    """Upward difficulty adjustments driven by winning streaks."""

	    def test_increases_after_sustained_high_accuracy(self):
	        tracker = make_state(difficulty=2)
	        # 15 hits in 15 episodes: accuracy 1.0, above the 0.70 mark.
	        # The hysteresis gap of 10 since the last change (initially 0) means
	        # a change can only happen from episode 10 onwards.
	        winning_streak = [True for _ in range(15)]
	        run_episodes(tracker, winning_streak)
	        level = tracker.domain_difficulties["math"]
	        assert level == 3

	    def test_does_not_increase_before_hysteresis_clears(self):
	        tracker = make_state(difficulty=2)
	        # 9 hits push accuracy above 0.70, yet fewer than 10 episodes have
	        # passed since episode 0, so hysteresis must hold the level.
	        short_streak = [True for _ in range(9)]
	        run_episodes(tracker, short_streak)
	        level = tracker.domain_difficulties["math"]
	        assert level == 2

	    def test_caps_at_max_difficulty(self):
	        # Already at the ceiling: a winning streak must not push past it.
	        tracker = make_state(difficulty=MAX_DIFFICULTY)
	        winning_streak = [True for _ in range(15)]
	        run_episodes(tracker, winning_streak)
	        level = tracker.domain_difficulties["math"]
	        assert level == MAX_DIFFICULTY


	# ---------------------------------------------------------------------------
	# Difficulty decreases
	# ---------------------------------------------------------------------------


	class TestDifficultyDecrease:
	    """Downward difficulty adjustments driven by losing streaks."""

	    def test_decreases_after_sustained_low_accuracy(self):
	        tracker = make_state(difficulty=3)
	        # 15 misses in a row: accuracy 0.0, below the 0.30 mark.
	        losing_streak = [False for _ in range(15)]
	        run_episodes(tracker, losing_streak)
	        level = tracker.domain_difficulties["math"]
	        assert level == 2

	    def test_does_not_decrease_before_hysteresis_clears(self):
	        tracker = make_state(difficulty=3)
	        # Only 9 episodes: the level is expected to stay where it started.
	        short_streak = [False for _ in range(9)]
	        run_episodes(tracker, short_streak)
	        level = tracker.domain_difficulties["math"]
	        assert level == 3

	    def test_floors_at_min_difficulty(self):
	        # Already at the floor: a losing streak must not push below it.
	        tracker = make_state(difficulty=MIN_DIFFICULTY)
	        losing_streak = [False for _ in range(15)]
	        run_episodes(tracker, losing_streak)
	        level = tracker.domain_difficulties["math"]
	        assert level == MIN_DIFFICULTY


	# ---------------------------------------------------------------------------
	# Hysteresis — no rapid oscillation
	# ---------------------------------------------------------------------------


	class TestHysteresis:
	    """Difficulty changes must be spaced out rather than oscillate."""

	    def test_no_second_change_before_hysteresis_window(self):
	        tracker = make_state(difficulty=2)
	        # An 11-episode winning streak is expected to produce the first
	        # increase (around episode 10).
	        run_episodes(tracker, [True for _ in range(11)])
	        level_after_first_bump = tracker.domain_difficulties["math"]
	        assert level_after_first_bump == 3

	        # Keep winning, but stop one episode short of HYSTERESIS_EPISODES:
	        # a second change must not fire yet.
	        followup_streak = [True] * (HYSTERESIS_EPISODES - 1)
	        run_episodes(tracker, followup_streak)
	        level_now = tracker.domain_difficulties["math"]
	        assert level_now == 3

	    def test_second_change_fires_after_hysteresis_window(self):
	        tracker = make_state(difficulty=2)
	        # First increase.
	        run_episodes(tracker, [True for _ in range(11)])
	        assert tracker.domain_difficulties["math"] == 3

	        # Go one episode past the hysteresis gap while accuracy stays above
	        # 0.70, so 10+ episodes have elapsed since the last change.
	        followup_streak = [True] * (HYSTERESIS_EPISODES + 1)
	        run_episodes(tracker, followup_streak)
	        level_now = tracker.domain_difficulties["math"]
	        assert level_now == 4

	    def test_oscillation_blocked(self):
	        """A quick flip from correct to wrong answers must not ping-pong the level."""
	        tracker = make_state(difficulty=3)
	        # 11 hits are expected to raise the level to 4.
	        run_episodes(tracker, [True for _ in range(11)])
	        assert tracker.domain_difficulties["math"] == 4

	        # 9 misses straight afterwards are not enough for a decrease
	        # (hysteresis + window), so the level stays at 4.
	        run_episodes(tracker, [False for _ in range(9)])
	        level_now = tracker.domain_difficulties["math"]
	        assert level_now == 4


	# ---------------------------------------------------------------------------
	# Per-domain independence
	# ---------------------------------------------------------------------------


	class TestPerDomainIndependence:
	    """Episodes in one domain must not move another domain's difficulty."""

	    def test_math_accuracy_does_not_affect_code(self):
	        levels = {"math": 2, "code": 3}
	        tracker = HonestState(current_domain="math", domain_difficulties=levels)
	        # Push math accuracy very high with a long winning streak.
	        run_episodes(tracker, [True for _ in range(15)], domain="math")
	        math_level = tracker.domain_difficulties["math"]
	        code_level = tracker.domain_difficulties["code"]
	        # math moved up ...
	        assert math_level > 2
	        # ... while code kept its starting level.
	        assert code_level == 3

	    def test_separate_hysteresis_per_domain(self):
	        levels = {"math": 2, "code": 2}
	        tracker = HonestState(current_domain="math", domain_difficulties=levels)
	        # Only math receives episodes.
	        run_episodes(tracker, [True for _ in range(11)], domain="math")
	        math_level = tracker.domain_difficulties["math"]
	        code_level = tracker.domain_difficulties["code"]
	        assert math_level == 3
	        # code never saw an episode and stays at 2.
	        assert code_level == 2

	    def test_domains_track_independently(self):
	        levels = {"math": 3, "logic": 3}
	        tracker = HonestState(current_domain="math", domain_difficulties=levels)
	        # A losing streak for math, then a winning streak for logic.
	        run_episodes(tracker, [False for _ in range(15)], domain="math")
	        run_episodes(tracker, [True for _ in range(15)], domain="logic")
	        math_level = tracker.domain_difficulties["math"]
	        logic_level = tracker.domain_difficulties["logic"]
	        assert math_level < 3
	        assert logic_level > 3


	# ---------------------------------------------------------------------------
	# Bounds enforcement
	# ---------------------------------------------------------------------------


	class TestBoundsEnforcement:
	    """Difficulty must stay within [MIN_DIFFICULTY, MAX_DIFFICULTY]."""

	    @pytest.mark.parametrize("start", [1, 2, 3, 4, 5])
	    def test_difficulty_always_in_range(self, start):
	        """Check the bounds on every episode of an extreme run, not just the last.

	        The state is driven with 30 correct answers followed by 30 wrong
	        ones from each starting difficulty.
	        """
	        state = make_state(difficulty=start)
	        # Extreme runs in both directions
	        run_episodes(state, [True] * 30 + [False] * 30)

	        # run_episodes stamps each history record with the difficulty in
	        # effect for that episode, so checking the records covers the
	        # intermediate levels as well as the final one.
	        for record in state.episode_history:
	            assert MIN_DIFFICULTY <= record["difficulty"] <= MAX_DIFFICULTY

	        d = state.domain_difficulties["math"]
	        assert MIN_DIFFICULTY <= d <= MAX_DIFFICULTY