| | import json |
| | import concurrent.futures |
| | from unittest.mock import MagicMock |
| |
|
| | |
| |
|
class MedicalClaimVerifier:
    """Reward scorer for three-level health-literacy summaries.

    A completion is expected to carry a JSON object with the keys
    ``low_health_literacy``, ``intermediate_health_literacy`` and
    ``proficient_health_literacy``.  Each summary is audited by an LLM judge
    against two lists of subclaims:

    * completeness -- share of gold-summary subclaims the summary supports;
    * coverage     -- share of source-text subclaims the summary supports.

    Each score is compared with the per-level threshold in ``self.thresholds``
    and turned into a reward by :meth:`get_reward_score`.
    """

    # Prompt fragments keyed by the literacy-level identifier.  Every value
    # starts with a newline, exactly like the original triple-quoted literals.
    _LEVEL_GUIDELINES = {
        "low_health_literacy": (
            "\n"
            "Level: Low Health Literacy (High Readability)\n"
            "Target: Individuals needing simple terms.\n"
            "Goal: 'Living room' language. Replace jargon (e.g., 'renal' -> 'kidney').\n"
            "Density: Strictly 'need-to-know' info from Gold Summary.\n"
            "Strategy: High paraphrasing, analogies, one idea per sentence.\n"
            "Faithfulness: Must align with Gold Summary."
        ),
        "intermediate_health_literacy": (
            "\n"
            "Level: Intermediate Health Literacy (Medium Readability)\n"
            "Target: General public.\n"
            "Goal: Standard vocabulary. Common medical terms okay; technical speak simplified.\n"
            "Density: Balanced. Use Gold Summary as lead, supplemented by context from Source.\n"
            "Strategy: Moderate paraphrasing. Remove minor technical details.\n"
            "Faithfulness: Maintain main narrative of Gold Summary."
        ),
        "proficient_health_literacy": (
            "\n"
            "Level: Proficient Health Literacy (Low Readability)\n"
            "Target: Researchers/Clinicians.\n"
            "Goal: Technical/Academic. Prioritize clinical nuance and accuracy.\n"
            "Density: High. Include data, physiological mechanisms, and statistics from Source.\n"
            "Strategy: Minimal paraphrasing. Retain original technical terminology.\n"
            "Faithfulness: Adhere to Source Text; add deeper scientific context."
        ),
    }

    # Spellings of a negative verdict after "_" and "-" are mapped to spaces.
    _NEGATIVE_VERDICTS = ("not supported", "notsupported", "unsupported")

    def __init__(self, mock_mode=False):
        """Set up thresholds and, unless ``mock_mode`` is set, the judge client.

        Args:
            mock_mode: When true, the ``openai`` package is not imported and no
                client is created; :meth:`check_support_api` then returns 0.0
                for every prompt without touching the network.
        """
        # Minimum completeness ("comp") and coverage ("cov") score a level
        # must reach for the corresponding check to count as passed.
        self.thresholds = {
            "low": {"comp": 0.6107, "cov": 0.3723},
            "intermediate": {"comp": 0.8199, "cov": 0.6611},
            "proficient": {"comp": 0.9569, "cov": 0.9069},
        }
        self.mock_mode = mock_mode

        # Always defined so that mock-mode instances never fail with an
        # AttributeError when these attributes are read.
        self.api_url = "http://172.16.34.29:8004/v1"
        self.model_name = "qwen3-32b-readctrl"
        self.client = None

        if not mock_mode:
            # Imported lazily so mock mode works without the openai package.
            from openai import OpenAI

            self.client = OpenAI(base_url=self.api_url, api_key="EMPTY")

    def get_audit_prompt(self, literacy_level):
        """Return the shared prompt header for ``literacy_level``.

        Unknown levels fall back to a generic one-line guideline.
        """
        guidelines = self._LEVEL_GUIDELINES.get(
            literacy_level, "Follow standard medical audit practices."
        )
        return (
            "\n"
            "### Literacy Level Context:\n"
            f"{guidelines}\n"
            "\n"
            "### Task Instructions:"
        )

    def get_completeness_prompt(self, generated_text, source_subclaim, literacy_level):
        """Build the judge prompt asking whether a gold-summary fact is covered.

        Args:
            generated_text: The summary under audit.
            source_subclaim: One fact taken from the gold summary.
            literacy_level: Level key such as ``"low_health_literacy"``.
        """
        base_instructions = self.get_audit_prompt(literacy_level)
        level_desc = literacy_level.replace("_", " ")
        return (
            f"{base_instructions}\n"
            f"1. Determine whether this Fact from the Gold Standard is covered in the {level_desc} summary.\n"
            "2. Mark 'supported' ONLY IF:\n"
            "- The fact is explicitly stated in the summary, OR\n"
            "- The fact is clearly paraphrased or simplified in a way that preserves its meaning.\n"
            "3. Do NOT mark 'supported' based solely on omission.\n"
            "- Absence of mention does NOT imply intentional exclusion.\n"
            '- Negative or exclusionary facts (e.g., "no complications," "no family history," "no systemic signs") must be explicitly conveyed.\n'
            "4. Mark 'not_supported' if:\n"
            "- The fact is completely omitted, OR\n"
            "- The summary discusses related information but does not confirm the specific fact.\n"
            "5. Literacy-based simplification is allowed, but factual meaning must be preserved.\n"
            "\n"
            f"SUMMARY: {generated_text}\n"
            f"FACT: {source_subclaim}\n"
            "\n"
            "output: 'supported' or 'not_supported'.\n"
        )

    def get_source_coverage_prompt(self, generated_text, source_subclaim, literacy_level):
        """Build the judge prompt asking whether a source-text fact is covered.

        Args:
            generated_text: The summary under audit.
            source_subclaim: One fact taken from the original source text.
            literacy_level: Level key such as ``"low_health_literacy"``.
        """
        base_instructions = self.get_audit_prompt(literacy_level)
        level_desc = literacy_level.replace("_", " ")
        return (
            f"{base_instructions}\n"
            f"1. Check whether the following Fact from the ORIGINAL Source Text is explicitly covered in the generated {level_desc} summary.\n"
            "2. Mark 'supported' ONLY IF:\n"
            "- The summary clearly states the fact, OR\n"
            "- The fact is conveyed through an explicit paraphrase or simplification that preserves its meaning.\n"
            "3. Do NOT infer support from silence or omission.\n"
            "- Absence of mention does NOT count as support.\n"
            '- Especially for negative or exclusionary facts (e.g., "no family history," "no extra-renal signs," "no complications"), the summary must explicitly indicate absence.\n'
            "4. Mark 'not_supported' if:\n"
            "- The summary omits the fact entirely, OR\n"
            "- The summary discusses related topics but does not clearly confirm the specific fact.\n"
            "5. Simplification for literacy level is allowed, but factual meaning must be preserved.\n"
            "\n"
            f"GENERATED SUMMARY: {generated_text}\n"
            f"SOURCE FACT: {source_subclaim}\n"
            "\n"
            "output: 'supported' or 'not_supported'."
        )

    @staticmethod
    def _parse_verdict(res):
        """Map a raw judge reply to 1.0 (supported) or 0.0 (anything else).

        Any negative spelling ("not_supported", "not supported",
        "not-supported", "unsupported") wins over a positive one, so an
        ambiguous reply is scored conservatively as 0.0.
        """
        text = res.lower()
        # NOTE(review): assumes a reasoning model may wrap its scratch work in
        # <think>...</think>; only the text after the last closing tag is
        # treated as the answer.  Replies without the tag are used unchanged.
        if "</think>" in text:
            text = text.split("</think>")[-1]
        # Treat "_" and "-" as spaces and collapse whitespace so that all
        # spellings of the verdict compare equal.
        normalized = " ".join(text.replace("_", " ").replace("-", " ").split())
        if any(marker in normalized for marker in MedicalClaimVerifier._NEGATIVE_VERDICTS):
            return 0.0
        return 1.0 if "supported" in normalized else 0.0

    def check_support_api(self, prompt):
        """Ask the judge model about ``prompt`` and return 1.0 or 0.0.

        Returns 0.0 when no client is configured (mock mode), when the request
        fails, or when the reply is not an unambiguous 'supported'.
        """
        client = getattr(self, "client", None)
        if client is None:
            # Mock mode: explicit, network-free result.
            return 0.0

        try:
            response = client.chat.completions.create(
                model=self.model_name,
                messages=[{"role": "user", "content": prompt}],
                max_tokens=300,
                temperature=0.1,
            )
            res = response.choices[0].message.content.strip().lower()
        except Exception as e:
            # Best-effort scoring: a failed request counts as "not supported",
            # but the failure is reported instead of being silently dropped.
            print(f"Support Check Error: {e}")
            return 0.0

        print(f"Response Received:\n{res}\n")
        return self._parse_verdict(res)

    @staticmethod
    def _mean(values):
        """Return the arithmetic mean of ``values`` or 0.0 when it is empty."""
        return sum(values) / len(values) if values else 0.0

    def evaluate_level(self, gen_text, gold_subs, full_subs, level_key):
        """Score one summary; return ``(completeness, coverage)`` in [0, 1].

        Args:
            gen_text: Summary text for this level; a falsy value scores (0, 0).
            gold_subs: Facts from the gold summary (completeness check).
            full_subs: Facts from the source text (coverage check).
            level_key: Level key such as ``"low_health_literacy"``.
        """
        if not gen_text:
            return 0.0, 0.0

        comp_prompts = [
            self.get_completeness_prompt(gen_text, s, level_key) for s in gold_subs
        ]
        cov_prompts = [
            self.get_source_coverage_prompt(gen_text, s, level_key) for s in full_subs
        ]

        with concurrent.futures.ThreadPoolExecutor(max_workers=2) as executor:
            # map() submits its tasks immediately, so queueing both batches
            # before collecting keeps the workers busy across the boundary.
            comp_iter = executor.map(self.check_support_api, comp_prompts)
            cov_iter = executor.map(self.check_support_api, cov_prompts)
            comp_results = list(comp_iter)
            cov_results = list(cov_iter)

        return self._mean(comp_results), self._mean(cov_results)

    @staticmethod
    def _extract_json_text(text):
        """Strip ``<SOLUTION>`` tags and Markdown fences around a JSON payload."""
        text = text.strip()

        # The SOLUTION block is the authoritative answer, so isolate it before
        # looking at fences that may also occur in surrounding text.
        if "<SOLUTION>" in text:
            text = text.split("<SOLUTION>")[-1].split("</SOLUTION>")[0].strip()

        if "```" in text:
            # Odd-indexed pieces of the split lie inside a fence; an unclosed
            # trailing fence is covered as well.  Use the last non-empty one.
            fenced = [part for part in text.split("```")[1::2] if part.strip()]
            if fenced:
                body = fenced[-1].strip()
                if body[:4].lower() == "json":
                    body = body[4:]
                text = body.strip()

        return text

    def get_reward_score(self, completion, gold_subs, full_subs):
        """Compute the reward for one completion.

        Args:
            completion: Sequence whose first item is a mapping with a
                ``'content'`` string holding the JSON answer.
            gold_subs: Facts from the gold summary.
            full_subs: Facts from the source text.

        Returns:
            -5.0 if the content cannot be read or parsed as JSON, or if scoring
            raises; -2.0 if the JSON is not an object containing all three
            ``*_health_literacy`` keys; otherwise the sum over the three levels
            of +1.0 per passed check and -0.5 per failed check (completeness
            and coverage are checked separately).
        """
        try:
            text = self._extract_json_text(completion[0]['content'])
            data = json.loads(text)
        except Exception as e:
            print(f"JSON Parse Error: {e}")
            return -5.0

        levels = ["low", "intermediate", "proficient"]
        # Valid JSON that is not an object (number, list, string, ...) cannot
        # hold the required keys; treat it like a missing-keys answer.
        if not isinstance(data, dict):
            return -2.0
        if not all(f"{lvl}_health_literacy" in data for lvl in levels):
            return -2.0

        try:
            total_reward = 0.0
            print("\n--- Evaluation Breakdown ---")
            for lvl in levels:
                level_key = f"{lvl}_health_literacy"
                gen_text = data.get(level_key, "")
                comp_score, cov_score = self.evaluate_level(
                    gen_text, gold_subs, full_subs, level_key
                )

                comp_passed = comp_score >= self.thresholds[lvl]["comp"]
                cov_passed = cov_score >= self.thresholds[lvl]["cov"]

                total_reward += 1.0 if comp_passed else -0.5
                total_reward += 1.0 if cov_passed else -0.5

                print(f"[{lvl.upper()}] Comp: {comp_score:.2f} ({comp_passed}), Cov: {cov_score:.2f} ({cov_passed})")

            return total_reward
        except Exception as e:
            print(f"Scoring Error: {e}")
            return -5.0
| |
|
| | |
| |
|
if __name__ == "__main__":
    # Demo run: score one hand-written completion against a small set of
    # gold-summary and source-text facts using the live judge endpoint.

    # Facts taken from the gold summary (completeness check).
    gold_subs = [
        "ACE inhibitors help the heart pump better.",
        "These medicines relax blood vessels.",
        "Common side effects include dizziness and low blood pressure.",
    ]

    # Facts taken from the full source text (coverage check).
    full_subs = [
        "Lisinopril is an example of an ACE inhibitor.",
        "ACE inhibitors lower the risk of a heart attack.",
        "The medication prevents stress hormones from damaging the heart.",
        "Patients should stand up slowly to avoid dizziness.",
    ]

    # Completion in the chat-message shape get_reward_score() expects: a
    # JSON object with all three literacy levels inside <SOLUTION> tags.
    demo_content = (
        "\n"
        "<SOLUTION>\n"
        "{\n"
        '"low_health_literacy": "This medicine makes it easier for your heart to pump and relaxes your blood tubes. You might feel dizzy if you stand up too fast.",\n'
        '"intermediate_health_literacy": "ACE inhibitors like Lisinopril relax blood vessels to improve flow and lower heart attack risk. Side effects include low blood pressure.",\n'
        '"proficient_health_literacy": "ACE inhibitors attenuate the effects of stress hormones on the myocardium while inducing vasodilation to reduce afterload and prevent myocardial infarction."\n'
        "}\n"
        "</SOLUTION>\n"
    )
    pass_completion = [{"content": demo_content}]

    verifier = MedicalClaimVerifier(mock_mode=False)

    print("Starting Demo Run...")
    final_reward = verifier.get_reward_score(pass_completion, gold_subs, full_subs)

    separator = "-" * 30
    print(separator)
    print(f"FINAL REWARD SCORE: {final_reward}")