"""
Minimal, offline tests for reward_mock.py.

Run:
    python code/RL_model/unsloth_rl/test_reward_mock_unittest.py

These tests avoid real OpenAI calls by:
- mocking the API key file read
- stubbing OpenAI client construction (falling back to a stub ``openai``
  module when the real package is not installed)
- overriding verifier.evaluate_level to deterministic outputs
"""

from __future__ import annotations

import importlib.util
import sys
import types
import unittest
from pathlib import Path
from unittest.mock import mock_open, patch


THIS_DIR = Path(__file__).resolve().parent
REWARD_MOCK_PATH = THIS_DIR / "reward_mock.py"

# Coverage thresholds the expected scores below are computed against.
# NOTE(review): these mirror the values used in the original expected-score
# arithmetic of this test file; confirm they match reward_mock.py if it changes.
LOW_COV_THRESHOLD = 0.3226
INTER_COV_THRESHOLD = 0.4091
PRO_COV_THRESHOLD = 0.9347

# Penalties the assertions below expect from compute_score.
MISSING_FIELD_PENALTY = 1.0
HIERARCHY_PENALTY = 2.0


class FakeOpenAI:
    """Stand-in for ``openai.OpenAI`` that only records the API key."""

    def __init__(self, api_key: str | None = None, **_kwargs):
        self.api_key = api_key


def _openai_patch():
    """Return a context manager that makes ``from openai import OpenAI`` yield FakeOpenAI.

    When the real ``openai`` package is importable, only its ``OpenAI``
    attribute is replaced. Otherwise a stub module exposing just ``OpenAI`` is
    placed in ``sys.modules`` for the duration of the patch, so the tests do
    not require the package to be installed.
    """
    try:
        import openai  # noqa: F401  (imported only to probe availability)
    except ImportError:
        stub = types.ModuleType("openai")
        stub.OpenAI = FakeOpenAI
        return patch.dict(sys.modules, {"openai": stub})
    return patch("openai.OpenAI", FakeOpenAI)


def load_reward_mock_module():
    """Load reward_mock.py from its file path under test-friendly patches.

    Returns:
        A freshly executed module object registered in ``sys.modules`` as
        ``reward_mock_under_test``.

    Raises:
        RuntimeError: if no import spec/loader can be created for the file.
    """
    module_name = "reward_mock_under_test"
    # Drop any previous copy so every call executes the module from scratch.
    sys.modules.pop(module_name, None)

    spec = importlib.util.spec_from_file_location(module_name, str(REWARD_MOCK_PATH))
    if spec is None or spec.loader is None:
        raise RuntimeError(f"Failed to create import spec for {REWARD_MOCK_PATH}")

    module = importlib.util.module_from_spec(spec)

    # Build the openai patch *before* mocking ``open`` so that a first-time
    # import of the real package is not performed against the mocked builtin.
    openai_patch = _openai_patch()

    # Register before executing (the documented importlib recipe) so code that
    # looks the module up in sys.modules during import can find it.
    sys.modules[module_name] = module
    try:
        # reward_mock does: `from openai import OpenAI` and reads an API key file.
        with patch("builtins.open", mock_open(read_data='{"openai": "sk-test"}')), openai_patch:
            spec.loader.exec_module(module)
    except BaseException:
        # Do not leave a half-initialised module behind.
        sys.modules.pop(module_name, None)
        raise

    return module


def _make_fake_evaluate_level(results):
    """Build a deterministic replacement for ``verifier.evaluate_level``.

    Args:
        results: mapping of generated text -> ``(comp, cov)`` tuple.

    Returns:
        A function with the ``(gen_text, gold_subs, full_subs)`` signature that
        returns the mapped tuple, or ``(0.0, 0.0)`` for unknown text.
    """

    def fake_evaluate_level(gen_text, gold_subs, full_subs):
        return results.get(gen_text, (0.0, 0.0))

    return fake_evaluate_level


class TestRewardMockComputeScore(unittest.TestCase):
    """Checks ``reward_mock.compute_score`` against hand-computed expectations."""

    def test_valid_json_progression_no_hierarchy_penalty(self):
        """All three levels present and coverage increasing: only deltas count."""
        rm = load_reward_mock_module()

        # Return (comp, cov) deterministically based on the generated text.
        rm.verifier.evaluate_level = _make_fake_evaluate_level(
            {
                "LOW": (1.0, 0.3000),
                "INTER": (1.0, 0.4000),
                "PRO": (1.0, 0.9500),
            }
        )

        solution_str = """```json
{
  "low_health_literacy": "LOW",
  "intermediate_health_literacy": "INTER",
  "proficient_health_literacy": "PRO"
}
```"""
        ground_truth = {
            "summary_subclaims": ["a", "b"],
            "fulltext_subclaims": ["x", "y", "z"],
        }

        score = rm.compute_score(data_source=None, solution_str=solution_str, ground_truth=ground_truth)

        # comp thresholds are 1.0 -> comp deltas = 0
        # cov deltas: (0.3000-0.3226) + (0.4000-0.4091) + (0.9500-0.9347) = -0.0164
        expected = (
            (0.3000 - LOW_COV_THRESHOLD)
            + (0.4000 - INTER_COV_THRESHOLD)
            + (0.9500 - PRO_COV_THRESHOLD)
        )
        self.assertAlmostEqual(score, expected, places=4)

    def test_missing_field_penalizes_and_triggers_hierarchy_penalty(self):
        """A missing intermediate level costs the field and hierarchy penalties."""
        rm = load_reward_mock_module()

        rm.verifier.evaluate_level = _make_fake_evaluate_level(
            {
                "LOW": (1.0, 0.3000),
                "PRO": (1.0, 0.9500),
            }
        )

        # intermediate is missing => -1.0
        # BUT its cov will be 0.0 for the hierarchy check, so
        # low_cov(0.3) <= int_cov(0.0) fails => -2.0
        solution_str = '{"low_health_literacy": "LOW", "proficient_health_literacy": "PRO"}'
        ground_truth = {
            "summary_subclaims": ["a"],
            "fulltext_subclaims": ["x"],
        }

        score = rm.compute_score(data_source=None, solution_str=solution_str, ground_truth=ground_truth)

        expected = (
            (0.3000 - LOW_COV_THRESHOLD)
            + (0.9500 - PRO_COV_THRESHOLD)
            - MISSING_FIELD_PENALTY
            - HIERARCHY_PENALTY
        )
        self.assertAlmostEqual(score, expected, places=4)

    def test_invalid_json_returns_minus_five(self):
        """Unparseable model output is expected to score exactly -5.0."""
        rm = load_reward_mock_module()
        ground_truth = {
            "summary_subclaims": ["a"],
            "fulltext_subclaims": ["x"],
        }

        score = rm.compute_score(data_source=None, solution_str="not a json", ground_truth=ground_truth)

        self.assertEqual(score, -5.0)

    def test_missing_claims_returns_zero(self):
        """An empty subclaim list is expected to short-circuit to 0.0."""
        rm = load_reward_mock_module()
        solution_str = (
            '{"low_health_literacy": "LOW", '
            '"intermediate_health_literacy": "INTER", '
            '"proficient_health_literacy": "PRO"}'
        )

        # Missing subclaims => early return 0.0
        score = rm.compute_score(
            data_source=None,
            solution_str=solution_str,
            ground_truth={"summary_subclaims": [], "fulltext_subclaims": ["x"]},
        )

        self.assertEqual(score, 0.0)


if __name__ == "__main__":
    unittest.main(verbosity=2)