"""Minimal, offline tests for reward_mock.py.
Run:
python code/RL_model/unsloth_rl/test_reward_mock_unittest.py
These tests avoid real OpenAI calls by:
- mocking the API key file read
- stubbing OpenAI client construction
- overriding verifier.evaluate_level to deterministic outputs
"""
from __future__ import annotations
import importlib.util
import sys
import types
import unittest
from pathlib import Path
from unittest.mock import mock_open, patch
# reward_mock.py is expected to live in the same directory as this test file.
THIS_DIR = Path(__file__).resolve().parent
REWARD_MOCK_PATH = THIS_DIR / "reward_mock.py"
class FakeOpenAI:
    """Stand-in for ``openai.OpenAI`` that never touches the network.

    Records the ``api_key`` it was constructed with and silently ignores
    any other keyword arguments, so it is signature-compatible with the
    real client constructor for the purposes of these tests.
    """

    def __init__(self, api_key: str | None = None, **_ignored):
        # The tests only ever inspect the stored key.
        self.api_key = api_key
def load_reward_mock_module():
    """Import reward_mock.py from its file path with test-friendly patches.

    Module-level code in reward_mock.py reads an API-key file and builds an
    OpenAI client (``from openai import OpenAI``), so during execution:
      - ``builtins.open`` is mocked to serve a fake key file, and
      - ``openai.OpenAI`` is replaced by :class:`FakeOpenAI`.

    A fresh module object is created on every call so each test starts from
    clean module state.
    """
    module_name = "reward_mock_under_test"
    # Drop any previously-loaded copy so exec_module runs from scratch.
    sys.modules.pop(module_name, None)

    spec = importlib.util.spec_from_file_location(module_name, str(REWARD_MOCK_PATH))
    if spec is None or spec.loader is None:
        raise RuntimeError(f"Failed to create import spec for {REWARD_MOCK_PATH}")
    module = importlib.util.module_from_spec(spec)

    # Patch the ctor symbol on the openai package itself, since reward_mock
    # imports it with `from openai import OpenAI` at module-execution time.
    with patch("builtins.open", mock_open(read_data='{"openai": "sk-test"}')), \
         patch("openai.OpenAI", FakeOpenAI):
        spec.loader.exec_module(module)

    sys.modules[module_name] = module
    return module
class TestRewardMockComputeScore(unittest.TestCase):
    """End-to-end checks of reward_mock.compute_score with a stubbed verifier.

    Each test loads a fresh copy of the module and swaps in a deterministic
    ``verifier.evaluate_level`` so no real API calls are made.
    """

    def test_valid_json_progression_no_hierarchy_penalty(self):
        rm = load_reward_mock_module()

        # Deterministic (comp, cov) per generated text, via table lookup.
        level_scores = {
            "LOW": (1.0, 0.3000),
            "INTER": (1.0, 0.4000),
            "PRO": (1.0, 0.9500),
        }

        def fake_evaluate_level(gen_text, gold_subs, full_subs):
            return level_scores.get(gen_text, (0.0, 0.0))

        rm.verifier.evaluate_level = fake_evaluate_level

        solution_str = """```json
{
  "low_health_literacy": "LOW",
  "intermediate_health_literacy": "INTER",
  "proficient_health_literacy": "PRO"
}
```"""
        ground_truth = {
            "summary_subclaims": ["a", "b"],
            "fulltext_subclaims": ["x", "y", "z"],
        }
        score = rm.compute_score(
            data_source=None, solution_str=solution_str, ground_truth=ground_truth
        )
        # comp thresholds are 1.0 -> comp deltas = 0
        # cov deltas: (0.3000-0.3226) + (0.4000-0.4091) + (0.9500-0.9347) = -0.0164
        self.assertAlmostEqual(score, -0.0164, places=4)

    def test_missing_field_penalizes_and_triggers_hierarchy_penalty(self):
        rm = load_reward_mock_module()

        level_scores = {
            "LOW": (1.0, 0.3000),
            "PRO": (1.0, 0.9500),
        }

        def fake_evaluate_level(gen_text, gold_subs, full_subs):
            return level_scores.get(gen_text, (0.0, 0.0))

        rm.verifier.evaluate_level = fake_evaluate_level

        # intermediate is missing => -1.0
        # BUT its cov will be 0.0 for the hierarchy check, so low_cov(0.3) <= int_cov(0.0) fails => -2.0
        solution_str = '{"low_health_literacy": "LOW", "proficient_health_literacy": "PRO"}'
        ground_truth = {
            "summary_subclaims": ["a"],
            "fulltext_subclaims": ["x"],
        }
        score = rm.compute_score(
            data_source=None, solution_str=solution_str, ground_truth=ground_truth
        )
        expected = (0.3000 - 0.3226) + (0.9500 - 0.9347) - 1.0 - 2.0
        self.assertAlmostEqual(score, expected, places=4)

    def test_invalid_json_returns_minus_five(self):
        rm = load_reward_mock_module()
        ground_truth = {
            "summary_subclaims": ["a"],
            "fulltext_subclaims": ["x"],
        }
        score = rm.compute_score(
            data_source=None, solution_str="not a json", ground_truth=ground_truth
        )
        self.assertEqual(score, -5.0)

    def test_missing_claims_returns_zero(self):
        rm = load_reward_mock_module()
        solution_str = '{"low_health_literacy": "LOW", "intermediate_health_literacy": "INTER", "proficient_health_literacy": "PRO"}'
        # Missing subclaims => early return 0.0
        score = rm.compute_score(
            data_source=None,
            solution_str=solution_str,
            ground_truth={"summary_subclaims": [], "fulltext_subclaims": ["x"]},
        )
        self.assertEqual(score, 0.0)
# Allow running this file directly: `python test_reward_mock_unittest.py`.
if __name__ == "__main__":
    unittest.main(verbosity=2)