File size: 10,001 Bytes
c7a6fe6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
import json
import concurrent.futures
from unittest.mock import MagicMock

# --- The Class (Modified slightly for standalone demo) ---

class MedicalClaimVerifier:
    """LLM-as-judge reward scorer for multi-literacy medical summaries.

    A completion is expected to contain a JSON object with three summaries
    (low / intermediate / proficient health literacy). Each summary is
    checked fact-by-fact against the gold summary ("completeness") and the
    original source text ("coverage") by querying a judge model, and the
    per-level pass/fail results are folded into a scalar reward.
    """

    def __init__(self, mock_mode=False):
        """Create a verifier.

        Args:
            mock_mode: When True, no OpenAI client is created (no network
                access needed); every support check then scores 0.0.
        """
        # Minimum completeness ("comp") / coverage ("cov") fractions a
        # level must reach to earn a positive reward contribution.
        self.thresholds = {
            "low": {"comp": 0.6107, "cov": 0.3723},
            "intermediate": {"comp": 0.8199, "cov": 0.6611},
            "proficient": {"comp": 0.9569, "cov": 0.9069}
        }
        self.mock_mode = mock_mode

        if not mock_mode:
            # Imported lazily so mock mode works without the openai package.
            from openai import OpenAI
            self.api_url = "http://172.16.34.29:8004/v1"
            self.client = OpenAI(base_url=self.api_url, api_key="EMPTY")
            self.model_name = "qwen3-32b-readctrl"

    def get_audit_prompt(self, literacy_level):
        """Return the shared prompt preamble for *literacy_level*.

        Unknown levels fall back to a generic instruction line.
        """
        level_guidelines = {
            "low_health_literacy": """
            Level: Low Health Literacy (High Readability)
            Target: Individuals needing simple terms.
            Goal: 'Living room' language. Replace jargon (e.g., 'renal' -> 'kidney').
            Density: Strictly 'need-to-know' info from Gold Summary.
            Strategy: High paraphrasing, analogies, one idea per sentence.
            Faithfulness: Must align with Gold Summary.""",
            
            "intermediate_health_literacy": """
            Level: Intermediate Health Literacy (Medium Readability)
            Target: General public.
            Goal: Standard vocabulary. Common medical terms okay; technical speak simplified.
            Density: Balanced. Use Gold Summary as lead, supplemented by context from Source.
            Strategy: Moderate paraphrasing. Remove minor technical details.
            Faithfulness: Maintain main narrative of Gold Summary.""",
            
            "proficient_health_literacy": """
            Level: Proficient Health Literacy (Low Readability)
            Target: Researchers/Clinicians.
            Goal: Technical/Academic. Prioritize clinical nuance and accuracy.
            Density: High. Include data, physiological mechanisms, and statistics from Source.
            Strategy: Minimal paraphrasing. Retain original technical terminology.
            Faithfulness: Adhere to Source Text; add deeper scientific context."""
        }

        guidelines = level_guidelines.get(literacy_level, "Follow standard medical audit practices.")

        base_instructions = f"""
        ### Literacy Level Context:
        {guidelines}

        ### Task Instructions:"""
        return base_instructions

    def get_completeness_prompt(self, generated_text, source_subclaim, literacy_level):
        """Build the judge prompt asking whether a gold-summary fact is
        supported by the generated summary."""
        base_instructions = self.get_audit_prompt(literacy_level)
        level_desc = literacy_level.replace("_", " ")
        return f"""{base_instructions}
            1. Determine whether this Fact from the Gold Standard is covered in the {level_desc} summary.
            2. Mark 'supported' ONLY IF:
            - The fact is explicitly stated in the summary, OR
            - The fact is clearly paraphrased or simplified in a way that preserves its meaning.
            3. Do NOT mark 'supported' based solely on omission.
            - Absence of mention does NOT imply intentional exclusion.
            - Negative or exclusionary facts (e.g., "no complications," "no family history," "no systemic signs") must be explicitly conveyed.
            4. Mark 'not_supported' if:
            - The fact is completely omitted, OR
            - The summary discusses related information but does not confirm the specific fact.
            5. Literacy-based simplification is allowed, but factual meaning must be preserved.

            SUMMARY: {generated_text}
            FACT: {source_subclaim}

            output: 'supported' or 'not_supported'.
            """

    def get_source_coverage_prompt(self, generated_text, source_subclaim, literacy_level):
        """Build the judge prompt asking whether a source-text fact is
        supported by the generated summary."""
        base_instructions = self.get_audit_prompt(literacy_level)
        level_desc = literacy_level.replace("_", " ")
        return f"""{base_instructions}
        1. Check whether the following Fact from the ORIGINAL Source Text is explicitly covered in the generated {level_desc} summary.
        2. Mark 'supported' ONLY IF:
        - The summary clearly states the fact, OR
        - The fact is conveyed through an explicit paraphrase or simplification that preserves its meaning.
        3. Do NOT infer support from silence or omission.
        - Absence of mention does NOT count as support.
        - Especially for negative or exclusionary facts (e.g., "no family history," "no extra-renal signs," "no complications"), the summary must explicitly indicate absence.
        4. Mark 'not_supported' if:
        - The summary omits the fact entirely, OR
        - The summary discusses related topics but does not clearly confirm the specific fact.
        5. Simplification for literacy level is allowed, but factual meaning must be preserved.

        GENERATED SUMMARY: {generated_text}
        SOURCE FACT: {source_subclaim}

        output: 'supported' or 'not_supported'."""

    def check_support_api(self, prompt):
        """Query the judge model; return 1.0 if the fact is supported, else 0.0.

        Any API failure is treated as "not supported" rather than crashing
        the scoring loop.
        """
        if self.mock_mode:
            # No client exists in mock mode; every check is unsupported.
            return 0.0

        try:
            response = self.client.chat.completions.create(
                model=self.model_name,
                messages=[{"role": "user", "content": prompt}],
                max_tokens=300, temperature=0.1,
            )
            res = response.choices[0].message.content.strip().lower()
            print(f"Response Received:\n{res}\n")
            # "not_supported" contains "supported" as a substring, so it
            # must be excluded explicitly.
            return 1.0 if "supported" in res and "not_supported" not in res else 0.0
        except Exception as e:
            # Narrowed from a bare except: never swallow KeyboardInterrupt/
            # SystemExit, and surface the failure for debugging.
            print(f"Support check failed: {e}")
            return 0.0

    def evaluate_level(self, gen_text, gold_subs, full_subs, level_key):
        """Score one literacy level.

        Returns:
            (completeness, coverage): fractions of gold_subs / full_subs the
            judge marks as supported by *gen_text*. (0.0, 0.0) when
            *gen_text* is empty; each fraction is 0.0 for an empty fact list.
        """
        if not gen_text: return 0.0, 0.0
        
        # Using 2 workers for demo to avoid overhead
        with concurrent.futures.ThreadPoolExecutor(max_workers=2) as executor:
            comp_prompts = [self.get_completeness_prompt(gen_text, s, level_key) for s in gold_subs]
            comp_results = list(executor.map(self.check_support_api, comp_prompts))
            comp_score = sum(comp_results) / len(comp_results) if comp_results else 0.0

            cov_prompts = [self.get_source_coverage_prompt(gen_text, s, level_key) for s in full_subs]
            cov_results = list(executor.map(self.check_support_api, cov_prompts))
            cov_score = sum(cov_results) / len(cov_results) if cov_results else 0.0

        return comp_score, cov_score

    def get_reward_score(self, completion, gold_subs, full_subs):
        """Convert a model completion into a scalar reward.

        Returns:
            -5.0 on JSON parse (or scoring) failure, -2.0 when any literacy
            key is missing, otherwise the sum over levels of +1.0/-0.5 for
            each passed/failed completeness and coverage threshold
            (range [-3.0, 6.0]).
        """
        data = None
        try:
            # completion[0]['content'] structure as expected by RL frameworks
            text = completion[0]['content'].strip()
            
            if "```json" in text:
                text = text.split("```json")[-1].split("```")[0].strip()
            elif "```" in text:
                # Take the contents of the FIRST fenced block. The previous
                # split("```")[-1] grabbed the text after the closing fence,
                # which is typically empty, so plain-fenced JSON never parsed.
                text = text.split("```")[1].strip()
            
            if "<SOLUTION>" in text:
                text = text.split("<SOLUTION>")[-1].split("</SOLUTION>")[0].strip()
                
            data = json.loads(text)
        except Exception as e:
            print(f"JSON Parse Error: {e}")
            return -5.0

        levels = ["low", "intermediate", "proficient"]
        if not all(f"{lvl}_health_literacy" in data for lvl in levels):
            return -2.0

        try:
            total_reward = 0.0
            print("\n--- Evaluation Breakdown ---")
            for lvl in levels:
                gen_text = data.get(f"{lvl}_health_literacy", "")
                comp_score, cov_score = self.evaluate_level(gen_text, gold_subs, full_subs, f"{lvl}_health_literacy")
                
                # Threshold check: pass earns +1.0, fail costs 0.5.
                comp_passed = comp_score >= self.thresholds[lvl]["comp"]
                cov_passed = cov_score >= self.thresholds[lvl]["cov"]
                
                total_reward += 1.0 if comp_passed else -0.5
                total_reward += 1.0 if cov_passed else -0.5
                
                print(f"[{lvl.upper()}] Comp: {comp_score:.2f} ({comp_passed}), Cov: {cov_score:.2f} ({cov_passed})")
                
            return total_reward
        except Exception as e:
            print(f"Scoring Error: {e}")
            return -5.0

# --- Execution Block ---

if __name__ == "__main__":
    # Demo driver: score one hand-written completion against small fact sets.
    demo_verifier = MedicalClaimVerifier(mock_mode=False)

    # Candidate model output, wrapped in the [{"content": ...}] structure
    # that RL frameworks pass to the reward function.
    demo_completion = [{
    "content": """
        <SOLUTION>
        {
            "low_health_literacy": "This medicine makes it easier for your heart to pump and relaxes your blood tubes. You might feel dizzy if you stand up too fast.",
            "intermediate_health_literacy": "ACE inhibitors like Lisinopril relax blood vessels to improve flow and lower heart attack risk. Side effects include low blood pressure.",
            "proficient_health_literacy": "ACE inhibitors attenuate the effects of stress hormones on the myocardium while inducing vasodilation to reduce afterload and prevent myocardial infarction."
        }
        </SOLUTION>
        """
    }]

    # Essential findings from the Gold Summary (completeness facts).
    gold_facts = [
        "ACE inhibitors help the heart pump better.",
        "These medicines relax blood vessels.",
        "Common side effects include dizziness and low blood pressure.",
    ]

    # Detailed facts from the original Full Text (source-coverage facts).
    source_facts = [
        "Lisinopril is an example of an ACE inhibitor.",
        "ACE inhibitors lower the risk of a heart attack.",
        "The medication prevents stress hormones from damaging the heart.",
        "Patients should stand up slowly to avoid dizziness.",
    ]

    # Run the demo and report the scalar reward.
    print("Starting Demo Run...")
    total_score = demo_verifier.get_reward_score(demo_completion, gold_facts, source_facts)

    print("-" * 30)
    print(f"FINAL REWARD SCORE: {total_score}")