| import os |
| import json |
| import tqdm |
| import argparse |
| import re |
| from openai import OpenAI |
|
|
| |
| |
| |
# Endpoint of a locally hosted, OpenAI-compatible inference server (vLLM-style).
API_URL = "http://172.16.34.29:8004/v1"
# Local servers usually ignore the key, but the client requires a non-empty string.
API_KEY = "EMPTY"
# Judge model served at API_URL; used for every chat-completion request below.
MODEL_NAME = "Qwen/Qwen3-30B-A3B-Instruct-2507"

# Single shared client; all requests in this script go through it.
client = OpenAI(base_url=API_URL, api_key=API_KEY)
|
|
| |
| |
| |
def get_audit_prompt(task_type: str, reference_text: str, subclaim: str, literacy_level: str) -> str:
    """Build the LLM-judge prompt for auditing one subclaim.

    Args:
        task_type: One of "attribution", "completeness", "conciseness",
            or "source_coverage"; selects which audit rubric is used.
        reference_text: Text the subclaim is judged against (source text,
            generated summary, or gold reference, depending on the task).
        subclaim: The atomic claim/fact to verify.
        literacy_level: Key into the literacy guidelines (e.g.
            "low_health_literacy"); unknown keys fall back to a generic
            guideline rather than failing.

    Returns:
        The fully formatted prompt string.

    Raises:
        ValueError: If ``task_type`` is not one of the four supported tasks.
            (The original implementation silently returned ``None`` here,
            which surfaced later as a confusing failure in the API call.)
    """
    level_guidelines = {
        "low_health_literacy": """
Level: Low Health Literacy (High Readability)
Target: Individuals needing simple terms.
Goal: 'Living room' language. Replace jargon (e.g., 'renal' -> 'kidney').
Density: Strictly 'need-to-know' info from Gold Summary.
Strategy: High paraphrasing, analogies, one idea per sentence.
Faithfulness: Must align with Gold Summary.""",

        "intermediate_health_literacy": """
Level: Intermediate Health Literacy (Medium Readability)
Target: General public.
Goal: Standard vocabulary. Common medical terms okay; technical speak simplified.
Density: Balanced. Use Gold Summary as lead, supplemented by context from Source.
Strategy: Moderate paraphrasing. Remove minor technical details.
Faithfulness: Maintain main narrative of Gold Summary.""",

        "proficient_health_literacy": """
Level: Proficient Health Literacy (Low Readability)
Target: Researchers/Clinicians.
Goal: Technical/Academic. Prioritize clinical nuance and accuracy.
Density: High. Include data, physiological mechanisms, and statistics from Source.
Strategy: Minimal paraphrasing. Retain original technical terminology.
Faithfulness: Adhere to Source Text; add deeper scientific context.""",
    }

    # Unknown levels degrade to a generic instruction instead of raising,
    # so a typo in the eval file cannot abort a long refinement run.
    guidelines = level_guidelines.get(literacy_level, "Follow standard medical audit practices.")
    level_desc = literacy_level.replace("_", " ")

    base_instructions = f"""
### Literacy Level Context:
{guidelines}

### Task Instructions:"""

    if task_type == "attribution":
        # Is the summary's subclaim grounded in the original source text?
        return f"""{base_instructions}
1. Compare the Subclaim against the Source Text.
2. Mark 'supported' ONLY IF:
- The Source Text explicitly states the claim, OR
- The claim is clearly conveyed through a faithful paraphrase that preserves its meaning.
3. Do NOT infer support from silence, omission, or related but non-equivalent statements.
4. For negative or exclusionary claims (e.g., "no complications," "no family history," "absence of signs"),
the Source Text must explicitly indicate absence.
5. Mark 'not_supported' if:
- The claim is missing, OR
- The Source discusses a related concept but does not confirm the specific claim.

SOURCE: {reference_text}
SUBCLAIM: {subclaim}

Provide reasoning in <reasoning> tags, then output: 'supported' or 'not_supported'.
"""

    elif task_type == "completeness":
        # Is a gold-standard fact covered by the generated summary?
        return f"""{base_instructions}
1. Determine whether this Fact from the Gold Standard is covered in the {level_desc} summary.
2. Mark 'supported' ONLY IF:
- The fact is explicitly stated in the summary, OR
- The fact is clearly paraphrased or simplified in a way that preserves its meaning.
3. Do NOT mark 'supported' based solely on omission.
- Absence of mention does NOT imply intentional exclusion.
- Negative or exclusionary facts (e.g., "no complications," "no family history," "no systemic signs") must be explicitly conveyed.
4. Mark 'not_supported' if:
- The fact is completely omitted, OR
- The summary discusses related information but does not confirm the specific fact.
5. Literacy-based simplification is allowed, but factual meaning must be preserved.

SUMMARY: {reference_text}
FACT: {subclaim}

Provide reasoning in <reasoning> tags, then output: 'supported' or 'not_supported'.
"""

    elif task_type == "conciseness":
        # Is an addition (present in summary, absent from gold) acceptable?
        return f"""{base_instructions}
1. The Subclaim appears in the summary but NOT in the Gold Reference.
2. Determine whether this addition is acceptable.
3. Mark 'supported' ONLY IF:
- The information is a definition, clarification, or explanatory restatement
of concepts already present in the Gold Reference, AND
- It does NOT introduce new clinical findings, test results, diagnoses,
causes, outcomes, or exclusions.
4. Do NOT mark 'supported' if the Subclaim:
- Adds a new medical fact not found in the Gold Reference, OR
- Draws clinical conclusions or inferences beyond what the source states.
5. Literacy-based explanation is allowed, but factual content must remain unchanged.

REFERENCE: {reference_text}
SUBCLAIM: {subclaim}

Provide reasoning in <reasoning> tags, then output: 'supported' or 'not_supported'.
"""

    elif task_type == "source_coverage":
        # Is a fact from the original source reflected in the generated summary?
        return f"""{base_instructions}
1. Check whether the following Fact from the ORIGINAL Source Text is explicitly covered in the generated {level_desc} summary.
2. Mark 'supported' ONLY IF:
- The summary clearly states the fact, OR
- The fact is conveyed through an explicit paraphrase or simplification that preserves its meaning.
3. Do NOT infer support from silence or omission.
- Absence of mention does NOT count as support.
- Especially for negative or exclusionary facts (e.g., "no family history," "no extra-renal signs," "no complications"), the summary must explicitly indicate absence.
4. Mark 'not_supported' if:
- The summary omits the fact entirely, OR
- The summary discusses related topics but does not clearly confirm the specific fact.
5. Simplification for literacy level is allowed, but factual meaning must be preserved.

GENERATED SUMMARY: {reference_text}
SOURCE FACT: {subclaim}

Provide reasoning in <reasoning> tags, then output: 'supported' or 'not_supported'.
"""

    # Fail loudly instead of falling through and returning None (the old
    # behavior), which produced an opaque error deep inside the API call.
    raise ValueError(
        f"Unknown task_type {task_type!r}; expected one of: "
        "'attribution', 'completeness', 'conciseness', 'source_coverage'"
    )
|
|
| |
| |
| |
def get_reasoned_verdict(reference: str, statement: str, task_type: str, literacy_level: str):
    """Ask the judge LLM to re-audit one subclaim and parse its verdict.

    Args:
        reference: Text the statement is judged against.
        statement: The subclaim/fact under audit.
        task_type: Audit rubric name, forwarded to ``get_audit_prompt``.
        literacy_level: Literacy-level key, forwarded to ``get_audit_prompt``.

    Returns:
        ``(reasoning, label)`` where label is "supported" or "not_supported".
        Any failure — API error, malformed response, bad task type — degrades
        to ``("API Error", "not_supported")`` so one bad item cannot abort a
        long refinement run.
    """
    try:
        # Built inside the try so prompt errors hit the same fallback as
        # API errors (originally it sat outside and could crash the run).
        prompt = get_audit_prompt(task_type, reference, statement, literacy_level)
        response = client.chat.completions.create(
            model=MODEL_NAME,
            messages=[{"role": "user", "content": prompt}],
            temperature=0.1,  # near-deterministic judging
        )
        content = response.choices[0].message.content

        # Null-safe reasoning extraction.  The original chained .group(1)
        # straight onto re.search(), which raised AttributeError (and threw
        # away a valid verdict) whenever the model opened <reasoning> but
        # never closed it.
        match = re.search(r"<reasoning>(.*?)</reasoning>", content, re.DOTALL)
        reasoning = match.group(1).strip() if match else "N/A"

        # The verdict is whatever follows the reasoning block.
        final_text = content.split("</reasoning>")[-1].lower()

        # "not_supported" contains "supported" as a substring, so its
        # presence must veto a "supported" label.
        if "supported" in final_text and "not_supported" not in final_text:
            label = "supported"
        else:
            label = "not_supported"
        return reasoning, label
    except Exception:
        # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit
        # still propagate and the run can be cancelled cleanly.
        return "API Error", "not_supported"
|
|
| |
| |
| |
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--eval_file", type=str, default="/home/mshahidul/readctrl/data/factual_testing/full_details_evaluation_0_20_qwen3-32B_v2.json")
    parser.add_argument("--source_file", type=str, default="/home/mshahidul/readctrl/data/extracting_subclaim/extracted_subclaims_syn_data_with_gs_summary_en.json")
    parser.add_argument("--save_path", type=str, default="/home/mshahidul/readctrl/data/reasoning/")
    args = parser.parse_args()

    os.makedirs(args.save_path, exist_ok=True)

    with open(args.eval_file, "r") as f:
        eval_data = json.load(f)
    # Index source records by 'index' for O(1) joins with the eval file.
    with open(args.source_file, "r") as f:
        source_data = {item['index']: item for item in json.load(f)}

    for doc in tqdm.tqdm(eval_data):
        idx = doc['index']
        original = source_data.get(idx, {})

        for level, content in doc['literacy_levels'].items():
            details = content['details']
            # Generated summary for this literacy level (empty if missing).
            gen_text = original.get('diff_label_texts', {}).get(level, '')

            # Each audit type pairs a different reference text with a
            # different claim field; only items the first pass marked
            # "not_supported" are re-judged.
            audits = [
                # (details key, reference text, claim field)
                ('attribution', original.get('fulltext'), 'subclaim'),
                ('conciseness', original.get('summary'), 'subclaim'),
                ('completeness', gen_text, 'source_fact'),
                ('source_coverage', gen_text, 'source_subclaim'),
            ]
            for task, reference, claim_field in audits:
                for item in details.get(task, []):
                    if item['status'] == "not_supported":
                        res, lbl = get_reasoned_verdict(reference, item[claim_field], task, level)
                        item.update({"reasoning": res, "status": lbl, "refined": True})

            # Recompute per-metric scores from the refined statuses.
            # FIX: the attribution score key is 'factual_attribution' while
            # the details key is 'attribution'; the original looked up
            # 'factual_attribution' in `details`, so attribution scores were
            # never refreshed after refinement.  (Score-key names assumed
            # from the original metrics list — verify against eval schema.)
            score_to_details = {
                'factual_attribution': 'attribution',
                'conciseness': 'conciseness',
                'completeness': 'completeness',
                'source_coverage': 'source_coverage',
            }
            for score_key, details_key in score_to_details.items():
                if details_key in details:
                    items = details[details_key]
                    content['scores'][score_key] = (
                        sum(1 for x in items if x['status'] == 'supported') / len(items)
                        if items else 0
                    )

    save_path = os.path.join(args.save_path, f"REFINED_{os.path.basename(args.eval_file)}")
    with open(save_path, "w") as f:
        json.dump(eval_data, f, indent=2)
    print(f"Refinement complete. Saved to {save_path}")