# Source: readCtrl_lambda / code/RL_model/unsloth_rl/test_reward_mock_unittest.py
# Author: mshahidul — "Initial commit of readCtrl code without large models" (030876e)
"""Minimal, offline tests for reward_mock.py.
Run:
python code/RL_model/unsloth_rl/test_reward_mock_unittest.py
These tests avoid real OpenAI calls by:
- mocking the API key file read
- stubbing OpenAI client construction
- overriding verifier.evaluate_level to deterministic outputs
"""
from __future__ import annotations
import importlib.util
import sys
import types
import unittest
from pathlib import Path
from unittest.mock import mock_open, patch
# Directory containing this test file; reward_mock.py is expected to live alongside it.
THIS_DIR = Path(__file__).resolve().parent
# Path of the module under test, loaded directly from disk (not via the import system).
REWARD_MOCK_PATH = THIS_DIR / "reward_mock.py"
class FakeOpenAI:
    """Stand-in for ``openai.OpenAI`` used while importing reward_mock.

    Records the ``api_key`` it was constructed with and silently discards
    every other keyword argument; no network client is ever created.
    """

    def __init__(self, api_key: str | None = None, **_kwargs):
        # Only the key is kept — the tests never exercise any client method.
        self.api_key = api_key
def load_reward_mock_module():
    """Load reward_mock.py from its file path under test-friendly patches.

    Returns the freshly executed module object. Each call re-executes the
    module so per-test monkey-patches (e.g. on ``verifier``) start clean.

    Offline guarantees:
    - ``open`` is patched so the API-key file read yields a fake key.
    - If the real ``openai`` package is not installed, a stub module is
      registered in ``sys.modules`` so ``from openai import OpenAI`` still
      succeeds; either way the ``OpenAI`` constructor is patched to
      :class:`FakeOpenAI`.

    Raises:
        RuntimeError: if an import spec cannot be built for the file.
    """
    module_name = "reward_mock_under_test"
    # Drop any previous load so exec_module runs the file from scratch.
    sys.modules.pop(module_name, None)
    spec = importlib.util.spec_from_file_location(module_name, str(REWARD_MOCK_PATH))
    if spec is None or spec.loader is None:
        raise RuntimeError(f"Failed to create import spec for {REWARD_MOCK_PATH}")
    module = importlib.util.module_from_spec(spec)
    # reward_mock does `from openai import OpenAI`. Make that import work even
    # when the openai package is absent by installing a stub module first;
    # patch("openai.OpenAI", ...) below then targets whichever module is present.
    try:
        import openai  # noqa: F401  — probe only; real package may be missing
    except ImportError:
        stub = types.ModuleType("openai")
        stub.OpenAI = FakeOpenAI
        sys.modules["openai"] = stub
    with patch("builtins.open", mock_open(read_data='{"openai": "sk-test"}')):
        with patch("openai.OpenAI", FakeOpenAI):
            spec.loader.exec_module(module)
    sys.modules[module_name] = module
    return module
class TestRewardMockComputeScore(unittest.TestCase):
    """Deterministic, offline tests for reward_mock.compute_score."""

    def test_valid_json_progression_no_hierarchy_penalty(self):
        rm = load_reward_mock_module()

        # Deterministic stub: map generated text straight to (comp, cov).
        outcomes = {
            "LOW": (1.0, 0.3000),
            "INTER": (1.0, 0.4000),
            "PRO": (1.0, 0.9500),
        }

        def fake_evaluate_level(gen_text, gold_subs, full_subs):
            return outcomes.get(gen_text, (0.0, 0.0))

        rm.verifier.evaluate_level = fake_evaluate_level
        solution_str = """```json
{
"low_health_literacy": "LOW",
"intermediate_health_literacy": "INTER",
"proficient_health_literacy": "PRO"
}
```"""
        ground_truth = {
            "summary_subclaims": ["a", "b"],
            "fulltext_subclaims": ["x", "y", "z"],
        }
        score = rm.compute_score(
            data_source=None, solution_str=solution_str, ground_truth=ground_truth
        )
        # comp thresholds are 1.0 -> comp deltas = 0
        # cov deltas: (0.3000-0.3226) + (0.4000-0.4091) + (0.9500-0.9347) = -0.0164
        self.assertAlmostEqual(score, -0.0164, places=4)

    def test_missing_field_penalizes_and_triggers_hierarchy_penalty(self):
        rm = load_reward_mock_module()

        outcomes = {"LOW": (1.0, 0.3000), "PRO": (1.0, 0.9500)}

        def fake_evaluate_level(gen_text, gold_subs, full_subs):
            return outcomes.get(gen_text, (0.0, 0.0))

        rm.verifier.evaluate_level = fake_evaluate_level
        # intermediate is missing => -1.0
        # BUT its cov will be 0.0 for the hierarchy check, so low_cov(0.3) <= int_cov(0.0) fails => -2.0
        solution_str = '{"low_health_literacy": "LOW", "proficient_health_literacy": "PRO"}'
        ground_truth = {
            "summary_subclaims": ["a"],
            "fulltext_subclaims": ["x"],
        }
        score = rm.compute_score(
            data_source=None, solution_str=solution_str, ground_truth=ground_truth
        )
        expected = (0.3000 - 0.3226) + (0.9500 - 0.9347) - 1.0 - 2.0
        self.assertAlmostEqual(score, expected, places=4)

    def test_invalid_json_returns_minus_five(self):
        rm = load_reward_mock_module()
        ground_truth = {
            "summary_subclaims": ["a"],
            "fulltext_subclaims": ["x"],
        }
        # Unparseable output is the hardest failure => fixed -5.0 penalty.
        score = rm.compute_score(
            data_source=None, solution_str="not a json", ground_truth=ground_truth
        )
        self.assertEqual(score, -5.0)

    def test_missing_claims_returns_zero(self):
        rm = load_reward_mock_module()
        solution_str = '{"low_health_literacy": "LOW", "intermediate_health_literacy": "INTER", "proficient_health_literacy": "PRO"}'
        # Missing subclaims => early return 0.0
        score = rm.compute_score(
            data_source=None,
            solution_str=solution_str,
            ground_truth={"summary_subclaims": [], "fulltext_subclaims": ["x"]},
        )
        self.assertEqual(score, 0.0)
# Allow running this file directly (see module docstring for the command).
if __name__ == "__main__":
    unittest.main(verbosity=2)