File size: 4,660 Bytes
030876e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
"""Minimal, offline tests for reward_mock.py.

Run:
  python code/RL_model/unsloth_rl/test_reward_mock_unittest.py

These tests avoid real OpenAI calls by:
- mocking the API key file read
- stubbing OpenAI client construction
- overriding verifier.evaluate_level to deterministic outputs
"""

from __future__ import annotations

import importlib.util
import sys
import types
import unittest
from pathlib import Path
from unittest.mock import mock_open, patch


# Resolve reward_mock.py relative to this test file so the suite can be
# launched from any working directory.
THIS_DIR = Path(__file__).resolve().parent
REWARD_MOCK_PATH = THIS_DIR / "reward_mock.py"


class FakeOpenAI:
    """Minimal stand-in for ``openai.OpenAI`` that only records the api key.

    Any additional constructor keyword arguments are accepted and silently
    discarded, since the real client takes far more configuration than
    these offline tests need.
    """

    def __init__(self, api_key: str | None = None, **_ignored):
        # The only state the tests ever inspect.
        self.api_key = api_key


def load_reward_mock_module():
    """Import reward_mock.py fresh from disk with OpenAI access stubbed out.

    Returns the newly executed module object, also registered in
    ``sys.modules`` under a test-specific name.
    """
    module_name = "reward_mock_under_test"
    # Drop any cached copy so every call executes the module from scratch.
    sys.modules.pop(module_name, None)

    spec = importlib.util.spec_from_file_location(module_name, str(REWARD_MOCK_PATH))
    if spec is None or spec.loader is None:
        raise RuntimeError(f"Failed to create import spec for {REWARD_MOCK_PATH}")

    module = importlib.util.module_from_spec(spec)

    # reward_mock reads an API-key file and constructs an OpenAI client at
    # import time; both are patched for the duration of exec_module so the
    # import stays fully offline.
    with patch("builtins.open", mock_open(read_data='{"openai": "sk-test"}')), \
            patch("openai.OpenAI", FakeOpenAI):
        spec.loader.exec_module(module)

    sys.modules[module_name] = module
    return module


class TestRewardMockComputeScore(unittest.TestCase):
    """End-to-end checks of reward_mock.compute_score with a stubbed verifier."""

    def test_valid_json_progression_no_hierarchy_penalty(self):
        module = load_reward_mock_module()

        # Deterministic (comp, cov) outcomes keyed by the generated text.
        outcomes = {
            "LOW": (1.0, 0.3000),
            "INTER": (1.0, 0.4000),
            "PRO": (1.0, 0.9500),
        }

        def stub_evaluate_level(gen_text, gold_subs, full_subs):
            return outcomes.get(gen_text, (0.0, 0.0))

        module.verifier.evaluate_level = stub_evaluate_level

        solution_str = """```json
        {
          "low_health_literacy": "LOW",
          "intermediate_health_literacy": "INTER",
          "proficient_health_literacy": "PRO"
        }
        ```"""

        ground_truth = {
            "summary_subclaims": ["a", "b"],
            "fulltext_subclaims": ["x", "y", "z"],
        }

        result = module.compute_score(
            data_source=None, solution_str=solution_str, ground_truth=ground_truth
        )

        # All comp values hit the 1.0 threshold -> zero comp deltas.
        # cov deltas: (0.3000-0.3226) + (0.4000-0.4091) + (0.9500-0.9347) = -0.0164
        self.assertAlmostEqual(result, -0.0164, places=4)

    def test_missing_field_penalizes_and_triggers_hierarchy_penalty(self):
        module = load_reward_mock_module()

        # Only LOW and PRO are recognized; everything else scores (0.0, 0.0).
        outcomes = {
            "LOW": (1.0, 0.3000),
            "PRO": (1.0, 0.9500),
        }

        def stub_evaluate_level(gen_text, gold_subs, full_subs):
            return outcomes.get(gen_text, (0.0, 0.0))

        module.verifier.evaluate_level = stub_evaluate_level

        # The intermediate level is absent => -1.0 penalty.
        # Its cov reads as 0.0 in the hierarchy check, so
        # low_cov(0.3) <= int_cov(0.0) fails => additional -2.0.
        solution_str = '{"low_health_literacy": "LOW", "proficient_health_literacy": "PRO"}'

        ground_truth = {
            "summary_subclaims": ["a"],
            "fulltext_subclaims": ["x"],
        }

        result = module.compute_score(
            data_source=None, solution_str=solution_str, ground_truth=ground_truth
        )
        expected = (0.3000 - 0.3226) + (0.9500 - 0.9347) - 1.0 - 2.0
        self.assertAlmostEqual(result, expected, places=4)

    def test_invalid_json_returns_minus_five(self):
        module = load_reward_mock_module()

        ground_truth = {
            "summary_subclaims": ["a"],
            "fulltext_subclaims": ["x"],
        }

        # Unparseable output is the hard-failure case.
        result = module.compute_score(
            data_source=None, solution_str="not a json", ground_truth=ground_truth
        )
        self.assertEqual(result, -5.0)

    def test_missing_claims_returns_zero(self):
        module = load_reward_mock_module()

        solution_str = '{"low_health_literacy": "LOW", "intermediate_health_literacy": "INTER", "proficient_health_literacy": "PRO"}'

        # An empty subclaim list short-circuits scoring to 0.0.
        result = module.compute_score(
            data_source=None,
            solution_str=solution_str,
            ground_truth={"summary_subclaims": [], "fulltext_subclaims": ["x"]},
        )
        self.assertEqual(result, 0.0)


if __name__ == "__main__":
    # Allow running this file directly; verbosity=2 prints one line per test.
    unittest.main(verbosity=2)