# Source: readCtrl_lambda / code/RL_model/unsloth_rl/test_reward_mock_unittest.py
# Author: mshahidul — "Initial commit of readCtrl code without large models" (030876e)
"""Minimal, offline tests for reward_mock.py.
Run:
python code/RL_model/unsloth_rl/test_reward_mock_unittest.py
These tests avoid real OpenAI calls by:
- mocking the API key file read
- stubbing OpenAI client construction
- overriding verifier.evaluate_level to deterministic outputs
"""
from __future__ import annotations
import importlib.util
import sys
import types
import unittest
from pathlib import Path
from unittest.mock import mock_open, patch
# Directory containing this test file; reward_mock.py is expected to live alongside it.
THIS_DIR = Path(__file__).resolve().parent
# Path of the module under test, loaded directly from disk (not via the import system).
REWARD_MOCK_PATH = THIS_DIR / "reward_mock.py"
class FakeOpenAI:
    """Stand-in for ``openai.OpenAI`` used while importing reward_mock.

    Records the ``api_key`` it was constructed with and silently discards
    every other keyword argument; no network client is ever created.
    """

    def __init__(self, api_key: str | None = None, **_kwargs):
        # Only the key is kept — the tests never exercise any client method.
        self.api_key = api_key
def load_reward_mock_module():
    """Load reward_mock.py from its file path under test-friendly patches.

    Returns the freshly executed module object. Each call re-executes the
    module so per-test monkey-patches (e.g. on ``verifier``) start clean.

    Offline guarantees:
    - ``open`` is patched so the API-key file read yields a fake key.
    - If the real ``openai`` package is not installed, a stub module is
      registered in ``sys.modules`` so ``from openai import OpenAI`` still
      succeeds; either way the ``OpenAI`` constructor is patched to
      :class:`FakeOpenAI`.

    Raises:
        RuntimeError: if an import spec cannot be built for the file.
    """
    module_name = "reward_mock_under_test"
    # Drop any previous load so exec_module runs the file from scratch.
    sys.modules.pop(module_name, None)
    spec = importlib.util.spec_from_file_location(module_name, str(REWARD_MOCK_PATH))
    if spec is None or spec.loader is None:
        raise RuntimeError(f"Failed to create import spec for {REWARD_MOCK_PATH}")
    module = importlib.util.module_from_spec(spec)
    # reward_mock does `from openai import OpenAI`. Make that import work even
    # when the openai package is absent by installing a stub module first;
    # patch("openai.OpenAI", ...) below then targets whichever module is present.
    try:
        import openai  # noqa: F401  — probe only; real package may be missing
    except ImportError:
        stub = types.ModuleType("openai")
        stub.OpenAI = FakeOpenAI
        sys.modules["openai"] = stub
    with patch("builtins.open", mock_open(read_data='{"openai": "sk-test"}')):
        with patch("openai.OpenAI", FakeOpenAI):
            spec.loader.exec_module(module)
    sys.modules[module_name] = module
    return module
class TestRewardMockComputeScore(unittest.TestCase):
    """Deterministic, offline tests for reward_mock.compute_score."""

    def test_valid_json_progression_no_hierarchy_penalty(self):
        rm = load_reward_mock_module()

        # Deterministic stub: map generated text straight to (comp, cov).
        outcomes = {
            "LOW": (1.0, 0.3000),
            "INTER": (1.0, 0.4000),
            "PRO": (1.0, 0.9500),
        }

        def fake_evaluate_level(gen_text, gold_subs, full_subs):
            return outcomes.get(gen_text, (0.0, 0.0))

        rm.verifier.evaluate_level = fake_evaluate_level
        solution_str = """```json
{
"low_health_literacy": "LOW",
"intermediate_health_literacy": "INTER",
"proficient_health_literacy": "PRO"
}
```"""
        ground_truth = {
            "summary_subclaims": ["a", "b"],
            "fulltext_subclaims": ["x", "y", "z"],
        }
        score = rm.compute_score(
            data_source=None, solution_str=solution_str, ground_truth=ground_truth
        )
        # comp thresholds are 1.0 -> comp deltas = 0
        # cov deltas: (0.3000-0.3226) + (0.4000-0.4091) + (0.9500-0.9347) = -0.0164
        self.assertAlmostEqual(score, -0.0164, places=4)

    def test_missing_field_penalizes_and_triggers_hierarchy_penalty(self):
        rm = load_reward_mock_module()

        outcomes = {"LOW": (1.0, 0.3000), "PRO": (1.0, 0.9500)}

        def fake_evaluate_level(gen_text, gold_subs, full_subs):
            return outcomes.get(gen_text, (0.0, 0.0))

        rm.verifier.evaluate_level = fake_evaluate_level
        # intermediate is missing => -1.0
        # BUT its cov will be 0.0 for the hierarchy check, so low_cov(0.3) <= int_cov(0.0) fails => -2.0
        solution_str = '{"low_health_literacy": "LOW", "proficient_health_literacy": "PRO"}'
        ground_truth = {
            "summary_subclaims": ["a"],
            "fulltext_subclaims": ["x"],
        }
        score = rm.compute_score(
            data_source=None, solution_str=solution_str, ground_truth=ground_truth
        )
        expected = (0.3000 - 0.3226) + (0.9500 - 0.9347) - 1.0 - 2.0
        self.assertAlmostEqual(score, expected, places=4)

    def test_invalid_json_returns_minus_five(self):
        rm = load_reward_mock_module()
        ground_truth = {
            "summary_subclaims": ["a"],
            "fulltext_subclaims": ["x"],
        }
        # Unparseable output is the hardest failure => fixed -5.0 penalty.
        score = rm.compute_score(
            data_source=None, solution_str="not a json", ground_truth=ground_truth
        )
        self.assertEqual(score, -5.0)

    def test_missing_claims_returns_zero(self):
        rm = load_reward_mock_module()
        solution_str = '{"low_health_literacy": "LOW", "intermediate_health_literacy": "INTER", "proficient_health_literacy": "PRO"}'
        # Missing subclaims => early return 0.0
        score = rm.compute_score(
            data_source=None,
            solution_str=solution_str,
            ground_truth={"summary_subclaims": [], "fulltext_subclaims": ["x"]},
        )
        self.assertEqual(score, 0.0)
# Allow running this file directly (see module docstring for the command).
if __name__ == "__main__":
    unittest.main(verbosity=2)