| """Minimal, offline tests for reward_mock.py. |
| |
| Run: |
| python code/RL_model/unsloth_rl/test_reward_mock_unittest.py |
| |
| These tests avoid real OpenAI calls by: |
| - mocking the API key file read |
| - stubbing OpenAI client construction |
| - overriding verifier.evaluate_level to deterministic outputs |
| """ |
|
|
| from __future__ import annotations |
|
|
| import importlib.util |
| import sys |
| import types |
| import unittest |
| from pathlib import Path |
| from unittest.mock import mock_open, patch |
|
|
|
|
# Directory containing this test file; reward_mock.py is expected to live
# alongside it (loaded by path below, not via the import system).
THIS_DIR = Path(__file__).resolve().parent
REWARD_MOCK_PATH = THIS_DIR / "reward_mock.py"
|
|
|
|
| class FakeOpenAI: |
| def __init__(self, api_key: str | None = None, **_kwargs): |
| self.api_key = api_key |
|
|
|
|
def load_reward_mock_module():
    """Import reward_mock.py from disk with external effects stubbed out.

    The module is (re)loaded fresh on every call so each test starts from a
    clean state. While the module body executes we:
      - patch ``builtins.open`` so the API-key file read yields a fake key, and
      - patch ``openai.OpenAI`` with :class:`FakeOpenAI` so no real client is
        constructed.
    """
    module_name = "reward_mock_under_test"
    # Drop any previously loaded copy so exec_module runs from scratch.
    sys.modules.pop(module_name, None)

    spec = importlib.util.spec_from_file_location(module_name, str(REWARD_MOCK_PATH))
    if spec is None or spec.loader is None:
        raise RuntimeError(f"Failed to create import spec for {REWARD_MOCK_PATH}")

    module = importlib.util.module_from_spec(spec)

    # Execute the module body under both patches simultaneously.
    with patch("builtins.open", mock_open(read_data='{"openai": "sk-test"}')), \
            patch("openai.OpenAI", FakeOpenAI):
        spec.loader.exec_module(module)

    sys.modules[module_name] = module
    return module
|
|
|
|
class TestRewardMockComputeScore(unittest.TestCase):
    """Offline tests for reward_mock.compute_score with a deterministic verifier."""

    @staticmethod
    def _stub_verifier(module, level_scores):
        """Replace ``module.verifier.evaluate_level`` with a dict-driven fake.

        A generation text present in *level_scores* evaluates to
        ``(1.0, level_scores[text])``; anything else evaluates to ``(0.0, 0.0)``.
        """
        def fake_evaluate_level(gen_text, gold_subs, full_subs):
            if gen_text in level_scores:
                return 1.0, level_scores[gen_text]
            return 0.0, 0.0

        module.verifier.evaluate_level = fake_evaluate_level

    def test_valid_json_progression_no_hierarchy_penalty(self):
        rm = load_reward_mock_module()
        # Scores increase LOW < INTER < PRO, so no hierarchy penalty applies.
        self._stub_verifier(rm, {"LOW": 0.3000, "INTER": 0.4000, "PRO": 0.9500})

        solution_str = """```json
{
"low_health_literacy": "LOW",
"intermediate_health_literacy": "INTER",
"proficient_health_literacy": "PRO"
}
```"""

        ground_truth = {
            "summary_subclaims": ["a", "b"],
            "fulltext_subclaims": ["x", "y", "z"],
        }

        score = rm.compute_score(data_source=None, solution_str=solution_str, ground_truth=ground_truth)

        # Expected total derived from reward_mock's scoring formula; the exact
        # per-level baseline constants live in reward_mock.py (not visible here).
        self.assertAlmostEqual(score, -0.0164, places=4)

    def test_missing_field_penalizes_and_triggers_hierarchy_penalty(self):
        rm = load_reward_mock_module()
        # INTER is intentionally unknown, mirroring the missing JSON field below.
        self._stub_verifier(rm, {"LOW": 0.3000, "PRO": 0.9500})

        # "intermediate_health_literacy" is deliberately absent from the output.
        solution_str = '{"low_health_literacy": "LOW", "proficient_health_literacy": "PRO"}'

        ground_truth = {
            "summary_subclaims": ["a"],
            "fulltext_subclaims": ["x"],
        }

        score = rm.compute_score(data_source=None, solution_str=solution_str, ground_truth=ground_truth)
        # Presumably: per-level (score - baseline) terms, minus a missing-field
        # penalty (1.0) and a hierarchy penalty (2.0) — confirm against reward_mock.py.
        expected = (0.3000 - 0.3226) + (0.9500 - 0.9347) - 1.0 - 2.0
        self.assertAlmostEqual(score, expected, places=4)

    def test_invalid_json_returns_minus_five(self):
        rm = load_reward_mock_module()

        ground_truth = {
            "summary_subclaims": ["a"],
            "fulltext_subclaims": ["x"],
        }

        # Unparseable output is the hardest failure: fixed -5.0 reward.
        score = rm.compute_score(data_source=None, solution_str="not a json", ground_truth=ground_truth)
        self.assertEqual(score, -5.0)

    def test_missing_claims_returns_zero(self):
        rm = load_reward_mock_module()

        solution_str = '{"low_health_literacy": "LOW", "intermediate_health_literacy": "INTER", "proficient_health_literacy": "PRO"}'

        # Empty summary_subclaims short-circuits scoring to 0.0.
        score = rm.compute_score(
            data_source=None,
            solution_str=solution_str,
            ground_truth={"summary_subclaims": [], "fulltext_subclaims": ["x"]},
        )
        self.assertEqual(score, 0.0)
|
|
|
|
if __name__ == "__main__":
    # Verbose output so each test name is printed as it runs.
    unittest.main(verbosity=2)
|
|