import json
import concurrent.futures
from unittest.mock import MagicMock
# --- The Class (Modified slightly for standalone demo) ---
class MedicalClaimVerifier:
    """LLM-judged reward scorer for health-literacy-controlled summaries.

    A candidate completion must contain three summaries (low / intermediate /
    proficient health literacy).  Each summary is scored on:

      * completeness    -- fraction of gold-summary facts it conveys, and
      * source coverage -- fraction of original-source facts it conveys,

    where each individual fact is judged 'supported' / 'not_supported' by an
    LLM judge.  Per-level scores are compared against fixed thresholds and
    folded into a single scalar reward.
    """

    def __init__(self, mock_mode=False):
        """Create a verifier.

        Args:
            mock_mode: when True, no OpenAI client is created so the class can
                be exercised offline; any judge call then fails and scores 0.0.
        """
        # Empirically tuned pass/fail cut-offs per literacy level.
        self.thresholds = {
            "low": {"comp": 0.6107, "cov": 0.3723},
            "intermediate": {"comp": 0.8199, "cov": 0.6611},
            "proficient": {"comp": 0.9569, "cov": 0.9069},
        }
        self.mock_mode = mock_mode
        if not mock_mode:
            # Imported lazily so mock_mode works without the openai package.
            from openai import OpenAI
            self.api_url = "http://172.16.34.29:8004/v1"
            self.client = OpenAI(base_url=self.api_url, api_key="EMPTY")
            self.model_name = "qwen3-32b-readctrl"

    def get_audit_prompt(self, literacy_level):
        """Return the shared prompt preamble for *literacy_level*.

        Unknown levels fall back to a generic audit instruction.
        """
        level_guidelines = {
            "low_health_literacy": """
Level: Low Health Literacy (High Readability)
Target: Individuals needing simple terms.
Goal: 'Living room' language. Replace jargon (e.g., 'renal' -> 'kidney').
Density: Strictly 'need-to-know' info from Gold Summary.
Strategy: High paraphrasing, analogies, one idea per sentence.
Faithfulness: Must align with Gold Summary.""",
            "intermediate_health_literacy": """
Level: Intermediate Health Literacy (Medium Readability)
Target: General public.
Goal: Standard vocabulary. Common medical terms okay; technical speak simplified.
Density: Balanced. Use Gold Summary as lead, supplemented by context from Source.
Strategy: Moderate paraphrasing. Remove minor technical details.
Faithfulness: Maintain main narrative of Gold Summary.""",
            "proficient_health_literacy": """
Level: Proficient Health Literacy (Low Readability)
Target: Researchers/Clinicians.
Goal: Technical/Academic. Prioritize clinical nuance and accuracy.
Density: High. Include data, physiological mechanisms, and statistics from Source.
Strategy: Minimal paraphrasing. Retain original technical terminology.
Faithfulness: Adhere to Source Text; add deeper scientific context."""
        }
        guidelines = level_guidelines.get(literacy_level, "Follow standard medical audit practices.")
        base_instructions = f"""
### Literacy Level Context:
{guidelines}
### Task Instructions:"""
        return base_instructions

    def get_completeness_prompt(self, generated_text, source_subclaim, literacy_level):
        """Build the judge prompt asking whether a gold-summary fact is covered."""
        base_instructions = self.get_audit_prompt(literacy_level)
        level_desc = literacy_level.replace("_", " ")
        return f"""{base_instructions}
1. Determine whether this Fact from the Gold Standard is covered in the {level_desc} summary.
2. Mark 'supported' ONLY IF:
- The fact is explicitly stated in the summary, OR
- The fact is clearly paraphrased or simplified in a way that preserves its meaning.
3. Do NOT mark 'supported' based solely on omission.
- Absence of mention does NOT imply intentional exclusion.
- Negative or exclusionary facts (e.g., "no complications," "no family history," "no systemic signs") must be explicitly conveyed.
4. Mark 'not_supported' if:
- The fact is completely omitted, OR
- The summary discusses related information but does not confirm the specific fact.
5. Literacy-based simplification is allowed, but factual meaning must be preserved.
SUMMARY: {generated_text}
FACT: {source_subclaim}
output: 'supported' or 'not_supported'.
"""

    def get_source_coverage_prompt(self, generated_text, source_subclaim, literacy_level):
        """Build the judge prompt asking whether a source-text fact is covered."""
        base_instructions = self.get_audit_prompt(literacy_level)
        level_desc = literacy_level.replace("_", " ")
        return f"""{base_instructions}
1. Check whether the following Fact from the ORIGINAL Source Text is explicitly covered in the generated {level_desc} summary.
2. Mark 'supported' ONLY IF:
- The summary clearly states the fact, OR
- The fact is conveyed through an explicit paraphrase or simplification that preserves its meaning.
3. Do NOT infer support from silence or omission.
- Absence of mention does NOT count as support.
- Especially for negative or exclusionary facts (e.g., "no family history," "no extra-renal signs," "no complications"), the summary must explicitly indicate absence.
4. Mark 'not_supported' if:
- The summary omits the fact entirely, OR
- The summary discusses related topics but does not clearly confirm the specific fact.
5. Simplification for literacy level is allowed, but factual meaning must be preserved.
GENERATED SUMMARY: {generated_text}
SOURCE FACT: {source_subclaim}
output: 'supported' or 'not_supported'."""

    def check_support_api(self, prompt):
        """Ask the judge model for a 'supported' / 'not_supported' verdict.

        Returns 1.0 for a clean 'supported' verdict, 0.0 otherwise.  Any API
        failure is treated as 'not supported' (best-effort scoring).
        """
        try:
            response = self.client.chat.completions.create(
                model=self.model_name,
                messages=[{"role": "user", "content": prompt}],
                max_tokens=300, temperature=0.1,
            )
            res = response.choices[0].message.content.strip().lower()
            print(f"Response Received:\n{res}\n")
            # 'not_supported' itself contains 'supported', hence both checks.
            return 1.0 if "supported" in res and "not_supported" not in res else 0.0
        except Exception as e:
            # Fixed: was a bare `except:` that silently swallowed everything,
            # including KeyboardInterrupt/SystemExit.  Log and score 0.0.
            print(f"API Error: {e}")
            return 0.0

    def evaluate_level(self, gen_text, gold_subs, full_subs, level_key):
        """Return (completeness, coverage) scores for one summary.

        completeness = fraction of gold_subs facts supported by gen_text;
        coverage     = fraction of full_subs (source) facts supported.
        An empty summary (or an empty fact list) scores 0.0 for that metric.
        """
        if not gen_text:
            return 0.0, 0.0
        # Using 2 workers for demo to avoid overhead
        with concurrent.futures.ThreadPoolExecutor(max_workers=2) as executor:
            comp_prompts = [self.get_completeness_prompt(gen_text, s, level_key) for s in gold_subs]
            comp_results = list(executor.map(self.check_support_api, comp_prompts))
            comp_score = sum(comp_results) / len(comp_results) if comp_results else 0.0
            cov_prompts = [self.get_source_coverage_prompt(gen_text, s, level_key) for s in full_subs]
            cov_results = list(executor.map(self.check_support_api, cov_prompts))
            cov_score = sum(cov_results) / len(cov_results) if cov_results else 0.0
        return comp_score, cov_score

    def get_reward_score(self, completion, gold_subs, full_subs):
        """Score one completion against the gold and source fact lists.

        Returns the summed per-level reward (+1.0 per passed metric, -0.5 per
        failed one), -5.0 on parse/scoring failure, or -2.0 when the parsed
        payload is not an object containing all three literacy levels.
        """
        data = None
        try:
            # completion[0]['content'] structure as expected by RL frameworks
            text = completion[0]['content'].strip()
            if "```json" in text:
                text = text.split("```json")[-1].split("```")[0].strip()
            elif "```" in text:
                # Fixed: split("```")[-1] took the text AFTER the closing fence
                # (usually empty); take the text between the first pair instead.
                text = text.split("```", 1)[1].split("```", 1)[0].strip()
            if "<SOLUTION>" in text:
                text = text.split("<SOLUTION>")[-1].split("</SOLUTION>")[0].strip()
            data = json.loads(text)
        except Exception as e:
            print(f"JSON Parse Error: {e}")
            return -5.0
        levels = ["low", "intermediate", "proficient"]
        # Guard: valid JSON that is not an object (e.g. a bare number) would
        # otherwise raise an uncaught TypeError on the membership test below.
        if not isinstance(data, dict) or not all(f"{lvl}_health_literacy" in data for lvl in levels):
            return -2.0
        try:
            total_reward = 0.0
            print("\n--- Evaluation Breakdown ---")
            for lvl in levels:
                gen_text = data.get(f"{lvl}_health_literacy", "")
                comp_score, cov_score = self.evaluate_level(
                    gen_text, gold_subs, full_subs, f"{lvl}_health_literacy")
                # Logic check against the per-level thresholds.
                comp_passed = comp_score >= self.thresholds[lvl]["comp"]
                cov_passed = cov_score >= self.thresholds[lvl]["cov"]
                total_reward += 1.0 if comp_passed else -0.5
                total_reward += 1.0 if cov_passed else -0.5
                print(f"[{lvl.upper()}] Comp: {comp_score:.2f} ({comp_passed}), Cov: {cov_score:.2f} ({cov_passed})")
            return total_reward
        except Exception as e:
            print(f"Scoring Error: {e}")
            return -5.0
# --- Demo entry point ---
if __name__ == "__main__":
    # Essential findings a reader must get from the Gold Summary (completeness).
    gold_facts = [
        "ACE inhibitors help the heart pump better.",
        "These medicines relax blood vessels.",
        "Common side effects include dizziness and low blood pressure."
    ]
    # Detailed facts from the original full text (source coverage).
    source_facts = [
        "Lisinopril is an example of an ACE inhibitor.",
        "ACE inhibitors lower the risk of a heart attack.",
        "The medication prevents stress hormones from damaging the heart.",
        "Patients should stand up slowly to avoid dizziness."
    ]
    # Candidate model output, wrapped in <SOLUTION> tags as an RL rollout would emit.
    demo_completion = [{
        "content": """
<SOLUTION>
{
"low_health_literacy": "This medicine makes it easier for your heart to pump and relaxes your blood tubes. You might feel dizzy if you stand up too fast.",
"intermediate_health_literacy": "ACE inhibitors like Lisinopril relax blood vessels to improve flow and lower heart attack risk. Side effects include low blood pressure.",
"proficient_health_literacy": "ACE inhibitors attenuate the effects of stress hormones on the myocardium while inducing vasodilation to reduce afterload and prevent myocardial infarction."
}
</SOLUTION>
"""
    }]

    scorer = MedicalClaimVerifier(mock_mode=False)
    print("Starting Demo Run...")
    reward = scorer.get_reward_score(demo_completion, gold_facts, source_facts)
    print("-" * 30)
    print(f"FINAL REWARD SCORE: {reward}")