import json
import concurrent.futures
from unittest.mock import MagicMock
# --- The Class (Modified slightly for standalone demo) ---
class MedicalClaimVerifier:
    """LLM-judged reward scorer for health-literacy-controlled summaries.

    A candidate completion must contain three summaries (low / intermediate /
    proficient health literacy).  Each summary is scored on:

      * completeness    -- fraction of gold-summary facts it conveys, and
      * source coverage -- fraction of original-source facts it conveys,

    where each individual fact is judged 'supported' / 'not_supported' by an
    LLM judge.  Per-level scores are compared against fixed thresholds and
    folded into a single scalar reward.
    """

    def __init__(self, mock_mode=False):
        """Create a verifier.

        Args:
            mock_mode: when True, no OpenAI client is created so the class can
                be exercised offline; any judge call then fails and scores 0.0.
        """
        # Empirically tuned pass/fail cut-offs per literacy level.
        self.thresholds = {
            "low": {"comp": 0.6107, "cov": 0.3723},
            "intermediate": {"comp": 0.8199, "cov": 0.6611},
            "proficient": {"comp": 0.9569, "cov": 0.9069},
        }
        self.mock_mode = mock_mode
        if not mock_mode:
            # Imported lazily so mock_mode works without the openai package.
            from openai import OpenAI
            self.api_url = "http://172.16.34.29:8004/v1"
            self.client = OpenAI(base_url=self.api_url, api_key="EMPTY")
            self.model_name = "qwen3-32b-readctrl"

    def get_audit_prompt(self, literacy_level):
        """Return the shared prompt preamble for *literacy_level*.

        Unknown levels fall back to a generic audit instruction.
        """
        level_guidelines = {
            "low_health_literacy": """
Level: Low Health Literacy (High Readability)
Target: Individuals needing simple terms.
Goal: 'Living room' language. Replace jargon (e.g., 'renal' -> 'kidney').
Density: Strictly 'need-to-know' info from Gold Summary.
Strategy: High paraphrasing, analogies, one idea per sentence.
Faithfulness: Must align with Gold Summary.""",
            "intermediate_health_literacy": """
Level: Intermediate Health Literacy (Medium Readability)
Target: General public.
Goal: Standard vocabulary. Common medical terms okay; technical speak simplified.
Density: Balanced. Use Gold Summary as lead, supplemented by context from Source.
Strategy: Moderate paraphrasing. Remove minor technical details.
Faithfulness: Maintain main narrative of Gold Summary.""",
            "proficient_health_literacy": """
Level: Proficient Health Literacy (Low Readability)
Target: Researchers/Clinicians.
Goal: Technical/Academic. Prioritize clinical nuance and accuracy.
Density: High. Include data, physiological mechanisms, and statistics from Source.
Strategy: Minimal paraphrasing. Retain original technical terminology.
Faithfulness: Adhere to Source Text; add deeper scientific context."""
        }
        guidelines = level_guidelines.get(literacy_level, "Follow standard medical audit practices.")
        base_instructions = f"""
### Literacy Level Context:
{guidelines}
### Task Instructions:"""
        return base_instructions

    def get_completeness_prompt(self, generated_text, source_subclaim, literacy_level):
        """Build the judge prompt asking whether a gold-summary fact is covered."""
        base_instructions = self.get_audit_prompt(literacy_level)
        level_desc = literacy_level.replace("_", " ")
        return f"""{base_instructions}
1. Determine whether this Fact from the Gold Standard is covered in the {level_desc} summary.
2. Mark 'supported' ONLY IF:
- The fact is explicitly stated in the summary, OR
- The fact is clearly paraphrased or simplified in a way that preserves its meaning.
3. Do NOT mark 'supported' based solely on omission.
- Absence of mention does NOT imply intentional exclusion.
- Negative or exclusionary facts (e.g., "no complications," "no family history," "no systemic signs") must be explicitly conveyed.
4. Mark 'not_supported' if:
- The fact is completely omitted, OR
- The summary discusses related information but does not confirm the specific fact.
5. Literacy-based simplification is allowed, but factual meaning must be preserved.
SUMMARY: {generated_text}
FACT: {source_subclaim}
output: 'supported' or 'not_supported'.
"""

    def get_source_coverage_prompt(self, generated_text, source_subclaim, literacy_level):
        """Build the judge prompt asking whether a source-text fact is covered."""
        base_instructions = self.get_audit_prompt(literacy_level)
        level_desc = literacy_level.replace("_", " ")
        return f"""{base_instructions}
1. Check whether the following Fact from the ORIGINAL Source Text is explicitly covered in the generated {level_desc} summary.
2. Mark 'supported' ONLY IF:
- The summary clearly states the fact, OR
- The fact is conveyed through an explicit paraphrase or simplification that preserves its meaning.
3. Do NOT infer support from silence or omission.
- Absence of mention does NOT count as support.
- Especially for negative or exclusionary facts (e.g., "no family history," "no extra-renal signs," "no complications"), the summary must explicitly indicate absence.
4. Mark 'not_supported' if:
- The summary omits the fact entirely, OR
- The summary discusses related topics but does not clearly confirm the specific fact.
5. Simplification for literacy level is allowed, but factual meaning must be preserved.
GENERATED SUMMARY: {generated_text}
SOURCE FACT: {source_subclaim}
output: 'supported' or 'not_supported'."""

    def check_support_api(self, prompt):
        """Ask the judge model for a 'supported' / 'not_supported' verdict.

        Returns 1.0 for a clean 'supported' verdict, 0.0 otherwise.  Any API
        failure is treated as 'not supported' (best-effort scoring).
        """
        try:
            response = self.client.chat.completions.create(
                model=self.model_name,
                messages=[{"role": "user", "content": prompt}],
                max_tokens=300, temperature=0.1,
            )
            res = response.choices[0].message.content.strip().lower()
            print(f"Response Received:\n{res}\n")
            # 'not_supported' itself contains 'supported', hence both checks.
            return 1.0 if "supported" in res and "not_supported" not in res else 0.0
        except Exception as e:
            # Fixed: was a bare `except:` that silently swallowed everything,
            # including KeyboardInterrupt/SystemExit.  Log and score 0.0.
            print(f"API Error: {e}")
            return 0.0

    def evaluate_level(self, gen_text, gold_subs, full_subs, level_key):
        """Return (completeness, coverage) scores for one summary.

        completeness = fraction of gold_subs facts supported by gen_text;
        coverage     = fraction of full_subs (source) facts supported.
        An empty summary (or an empty fact list) scores 0.0 for that metric.
        """
        if not gen_text:
            return 0.0, 0.0
        # Using 2 workers for demo to avoid overhead
        with concurrent.futures.ThreadPoolExecutor(max_workers=2) as executor:
            comp_prompts = [self.get_completeness_prompt(gen_text, s, level_key) for s in gold_subs]
            comp_results = list(executor.map(self.check_support_api, comp_prompts))
            comp_score = sum(comp_results) / len(comp_results) if comp_results else 0.0
            cov_prompts = [self.get_source_coverage_prompt(gen_text, s, level_key) for s in full_subs]
            cov_results = list(executor.map(self.check_support_api, cov_prompts))
            cov_score = sum(cov_results) / len(cov_results) if cov_results else 0.0
        return comp_score, cov_score

    def get_reward_score(self, completion, gold_subs, full_subs):
        """Score one completion against the gold and source fact lists.

        Returns the summed per-level reward (+1.0 per passed metric, -0.5 per
        failed one), -5.0 on parse/scoring failure, or -2.0 when the parsed
        payload is not an object containing all three literacy levels.
        """
        data = None
        try:
            # completion[0]['content'] structure as expected by RL frameworks
            text = completion[0]['content'].strip()
            if "```json" in text:
                text = text.split("```json")[-1].split("```")[0].strip()
            elif "```" in text:
                # Fixed: split("```")[-1] took the text AFTER the closing fence
                # (usually empty); take the text between the first pair instead.
                text = text.split("```", 1)[1].split("```", 1)[0].strip()
            if "<SOLUTION>" in text:
                text = text.split("<SOLUTION>")[-1].split("</SOLUTION>")[0].strip()
            data = json.loads(text)
        except Exception as e:
            print(f"JSON Parse Error: {e}")
            return -5.0
        levels = ["low", "intermediate", "proficient"]
        # Guard: valid JSON that is not an object (e.g. a bare number) would
        # otherwise raise an uncaught TypeError on the membership test below.
        if not isinstance(data, dict) or not all(f"{lvl}_health_literacy" in data for lvl in levels):
            return -2.0
        try:
            total_reward = 0.0
            print("\n--- Evaluation Breakdown ---")
            for lvl in levels:
                gen_text = data.get(f"{lvl}_health_literacy", "")
                comp_score, cov_score = self.evaluate_level(
                    gen_text, gold_subs, full_subs, f"{lvl}_health_literacy")
                # Logic check against the per-level thresholds.
                comp_passed = comp_score >= self.thresholds[lvl]["comp"]
                cov_passed = cov_score >= self.thresholds[lvl]["cov"]
                total_reward += 1.0 if comp_passed else -0.5
                total_reward += 1.0 if cov_passed else -0.5
                print(f"[{lvl.upper()}] Comp: {comp_score:.2f} ({comp_passed}), Cov: {cov_score:.2f} ({cov_passed})")
            return total_reward
        except Exception as e:
            print(f"Scoring Error: {e}")
            return -5.0
# --- Demo entry point ---
if __name__ == "__main__":
    # Essential findings a reader must get from the Gold Summary (completeness).
    gold_facts = [
        "ACE inhibitors help the heart pump better.",
        "These medicines relax blood vessels.",
        "Common side effects include dizziness and low blood pressure."
    ]
    # Detailed facts from the original full text (source coverage).
    source_facts = [
        "Lisinopril is an example of an ACE inhibitor.",
        "ACE inhibitors lower the risk of a heart attack.",
        "The medication prevents stress hormones from damaging the heart.",
        "Patients should stand up slowly to avoid dizziness."
    ]
    # Candidate model output, wrapped in <SOLUTION> tags as an RL rollout would emit.
    demo_completion = [{
        "content": """
<SOLUTION>
{
"low_health_literacy": "This medicine makes it easier for your heart to pump and relaxes your blood tubes. You might feel dizzy if you stand up too fast.",
"intermediate_health_literacy": "ACE inhibitors like Lisinopril relax blood vessels to improve flow and lower heart attack risk. Side effects include low blood pressure.",
"proficient_health_literacy": "ACE inhibitors attenuate the effects of stress hormones on the myocardium while inducing vasodilation to reduce afterload and prevent myocardial infarction."
}
</SOLUTION>
"""
    }]

    scorer = MedicalClaimVerifier(mock_mode=False)
    print("Starting Demo Run...")
    reward = scorer.get_reward_score(demo_completion, gold_facts, source_facts)
    print("-" * 30)
    print(f"FINAL REWARD SCORE: {reward}")