"""Minimal, offline tests for reward_mock.py.
Run:
python code/RL_model/unsloth_rl/test_reward_mock_unittest.py
These tests avoid real OpenAI calls by:
- mocking the API key file read
- stubbing OpenAI client construction
- overriding verifier.evaluate_level to deterministic outputs
"""
from __future__ import annotations
import importlib.util
import sys
import types
import unittest
from pathlib import Path
from unittest.mock import mock_open, patch
# reward_mock.py is expected to live in the same directory as this test file.
THIS_DIR = Path(__file__).resolve().parent
REWARD_MOCK_PATH = THIS_DIR / "reward_mock.py"
class FakeOpenAI:
    """Stand-in for ``openai.OpenAI`` that never touches the network.

    Records the ``api_key`` it was constructed with and silently ignores
    any other keyword arguments, so it is signature-compatible with the
    real client constructor for the purposes of these tests.
    """

    def __init__(self, api_key: str | None = None, **_ignored):
        # The tests only ever inspect the stored key.
        self.api_key = api_key
def load_reward_mock_module():
    """Import reward_mock.py from its file path with test-friendly patches.

    Module-level code in reward_mock.py reads an API-key file and builds an
    OpenAI client (``from openai import OpenAI``), so during execution:
      - ``builtins.open`` is mocked to serve a fake key file, and
      - ``openai.OpenAI`` is replaced by :class:`FakeOpenAI`.

    A fresh module object is created on every call so each test starts from
    clean module state.
    """
    module_name = "reward_mock_under_test"
    # Drop any previously-loaded copy so exec_module runs from scratch.
    sys.modules.pop(module_name, None)

    spec = importlib.util.spec_from_file_location(module_name, str(REWARD_MOCK_PATH))
    if spec is None or spec.loader is None:
        raise RuntimeError(f"Failed to create import spec for {REWARD_MOCK_PATH}")
    module = importlib.util.module_from_spec(spec)

    # Patch the ctor symbol on the openai package itself, since reward_mock
    # imports it with `from openai import OpenAI` at module-execution time.
    with patch("builtins.open", mock_open(read_data='{"openai": "sk-test"}')), \
         patch("openai.OpenAI", FakeOpenAI):
        spec.loader.exec_module(module)

    sys.modules[module_name] = module
    return module
class TestRewardMockComputeScore(unittest.TestCase):
    """End-to-end checks of reward_mock.compute_score with a stubbed verifier.

    Each test loads a fresh copy of the module and swaps in a deterministic
    ``verifier.evaluate_level`` so no real API calls are made.
    """

    def test_valid_json_progression_no_hierarchy_penalty(self):
        rm = load_reward_mock_module()

        # Deterministic (comp, cov) per generated text, via table lookup.
        level_scores = {
            "LOW": (1.0, 0.3000),
            "INTER": (1.0, 0.4000),
            "PRO": (1.0, 0.9500),
        }

        def fake_evaluate_level(gen_text, gold_subs, full_subs):
            return level_scores.get(gen_text, (0.0, 0.0))

        rm.verifier.evaluate_level = fake_evaluate_level

        solution_str = """```json
{
  "low_health_literacy": "LOW",
  "intermediate_health_literacy": "INTER",
  "proficient_health_literacy": "PRO"
}
```"""
        ground_truth = {
            "summary_subclaims": ["a", "b"],
            "fulltext_subclaims": ["x", "y", "z"],
        }
        score = rm.compute_score(
            data_source=None, solution_str=solution_str, ground_truth=ground_truth
        )
        # comp thresholds are 1.0 -> comp deltas = 0
        # cov deltas: (0.3000-0.3226) + (0.4000-0.4091) + (0.9500-0.9347) = -0.0164
        self.assertAlmostEqual(score, -0.0164, places=4)

    def test_missing_field_penalizes_and_triggers_hierarchy_penalty(self):
        rm = load_reward_mock_module()

        level_scores = {
            "LOW": (1.0, 0.3000),
            "PRO": (1.0, 0.9500),
        }

        def fake_evaluate_level(gen_text, gold_subs, full_subs):
            return level_scores.get(gen_text, (0.0, 0.0))

        rm.verifier.evaluate_level = fake_evaluate_level

        # intermediate is missing => -1.0
        # BUT its cov will be 0.0 for the hierarchy check, so low_cov(0.3) <= int_cov(0.0) fails => -2.0
        solution_str = '{"low_health_literacy": "LOW", "proficient_health_literacy": "PRO"}'
        ground_truth = {
            "summary_subclaims": ["a"],
            "fulltext_subclaims": ["x"],
        }
        score = rm.compute_score(
            data_source=None, solution_str=solution_str, ground_truth=ground_truth
        )
        expected = (0.3000 - 0.3226) + (0.9500 - 0.9347) - 1.0 - 2.0
        self.assertAlmostEqual(score, expected, places=4)

    def test_invalid_json_returns_minus_five(self):
        rm = load_reward_mock_module()
        ground_truth = {
            "summary_subclaims": ["a"],
            "fulltext_subclaims": ["x"],
        }
        score = rm.compute_score(
            data_source=None, solution_str="not a json", ground_truth=ground_truth
        )
        self.assertEqual(score, -5.0)

    def test_missing_claims_returns_zero(self):
        rm = load_reward_mock_module()
        solution_str = '{"low_health_literacy": "LOW", "intermediate_health_literacy": "INTER", "proficient_health_literacy": "PRO"}'
        # Missing subclaims => early return 0.0
        score = rm.compute_score(
            data_source=None,
            solution_str=solution_str,
            ground_truth={"summary_subclaims": [], "fulltext_subclaims": ["x"]},
        )
        self.assertEqual(score, 0.0)
# Allow running this file directly: `python test_reward_mock_unittest.py`.
if __name__ == "__main__":
    unittest.main(verbosity=2)