"""Re-audit 'not_supported' subclaim verdicts with a literacy-aware LLM judge.

The script loads an evaluation file and the matching source documents, asks
the judge model to reconsider every attribution / conciseness item currently
marked ``not_supported``, stores the judge's reasoning and verdict on the
item, recomputes the affected scores and writes the refined evaluation to
disk.
"""
import argparse
import json
import logging
import os
import re

import tqdm
from openai import OpenAI

logger = logging.getLogger(__name__)

# -----------------------------
# CONFIGURATION
# -----------------------------
API_URL = "http://172.16.34.29:8004/v1"
API_KEY = "EMPTY"
MODEL_NAME = "Qwen/Qwen3-30B-A3B-Instruct-2507"

client = OpenAI(base_url=API_URL, api_key=API_KEY)

# -----------------------------
# REASONING PROMPTS
# -----------------------------
LEVEL_GUIDELINES = {
    "low_health_literacy": """
Level: Low Health Literacy (High Readability)
Target: Individuals needing simple terms.
Goal: 'Living room' language. Replace jargon (e.g., 'renal' -> 'kidney').
Density: Strictly 'need-to-know' info from Gold Summary.
Strategy: High paraphrasing, analogies, one idea per sentence.
Faithfulness: Must align with Gold Summary.""",
    "intermediate_health_literacy": """
Level: Intermediate Health Literacy (Medium Readability)
Target: General public.
Goal: Standard vocabulary. Common medical terms okay; technical speak simplified.
Density: Balanced. Use Gold Summary as lead, supplemented by context from Source.
Strategy: Moderate paraphrasing. Remove minor technical details.
Faithfulness: Maintain main narrative of Gold Summary.""",
    "proficient_health_literacy": """
Level: Proficient Health Literacy (Low Readability)
Target: Researchers/Clinicians.
Goal: Technical/Academic. Prioritize clinical nuance and accuracy.
Density: High. Include data, physiological mechanisms, and statistics from Source.
Strategy: Minimal paraphrasing. Retain original technical terminology.
Faithfulness: Adhere to Source Text; add deeper scientific context.""",
}

# The tag named in the prompt and the tag parsed from the reply must agree,
# so both are derived from this single constant.
REASONING_TAG = "reasoning"

_OUTPUT_INSTRUCTION = (
    f"Provide reasoning in <{REASONING_TAG}></{REASONING_TAG}> tags, "
    "then output: 'supported' or 'not_supported'."
)

# task_type -> (numbered instructions, label of the reference text, label of the claim)
_TASK_SPECS = {
    "attribution": (
        "1. Compare the Subclaim against the Source Text.\n"
        "2. Flag as 'supported' if the Source contains this claim, "
        "even if highly paraphrased for {level_desc}.",
        "SOURCE",
        "SUBCLAIM",
    ),
    "completeness": (
        "1. Is this Fact from the Gold Standard missing from the {level_desc} summary?\n"
        "2. Mark 'supported' if: The info is present (paraphrased) OR if the info "
        "was omitted because it is too complex for {level_desc} guidelines.",
        "SUMMARY",
        "FACT",
    ),
    "conciseness": (
        "1. The Subclaim exists in the summary but NOT in the Gold Reference. Is this okay?\n"
        "2. Mark 'supported' if: The info adds necessary definitions or scientific "
        "depth appropriate for {level_desc}.",
        "REFERENCE",
        "SUBCLAIM",
    ),
    # Source coverage prompt
    "source_coverage": (
        "1. Check if the following Fact from the ORIGINAL Source Text is covered "
        "in the generated {level_desc} summary.\n"
        "2. Mark 'supported' if the summary includes this information, even if it "
        "is simplified or combined with other points.\n"
        "3. Mark 'not_supported' if the summary completely omits this specific medical fact.",
        "GENERATED SUMMARY",
        "SOURCE FACT",
    ),
}


def get_audit_prompt(task_type, reference_text, subclaim, literacy_level):
    """Build the judge prompt for one audit item.

    Args:
        task_type: One of ``"attribution"``, ``"completeness"``,
            ``"conciseness"`` or ``"source_coverage"``.
        reference_text: Text the claim is checked against.
        subclaim: The claim / fact under audit.
        literacy_level: Key into ``LEVEL_GUIDELINES``; unknown levels fall
            back to a generic guideline sentence.

    Returns:
        The prompt string to send as the user message.

    Raises:
        ValueError: If ``task_type`` is not a known task.
    """
    try:
        instructions, reference_label, claim_label = _TASK_SPECS[task_type]
    except KeyError:
        # Previously an unknown task silently produced ``None`` as the prompt.
        raise ValueError(f"Unknown task_type: {task_type!r}") from None

    guidelines = LEVEL_GUIDELINES.get(
        literacy_level, "Follow standard medical audit practices."
    )
    level_desc = literacy_level.replace("_", " ")

    return (
        f"\n### Literacy Level Context:\n{guidelines}\n\n"
        f"### Task Instructions:\n"
        f"{instructions.format(level_desc=level_desc)}\n\n"
        f"{reference_label}: {reference_text}\n"
        f"{claim_label}: {subclaim}\n\n"
        f"{_OUTPUT_INSTRUCTION}"
    )


# -----------------------------
# LOGIC
# -----------------------------
_REASONING_RE = re.compile(
    rf"<{REASONING_TAG}>(.*?)</{REASONING_TAG}>", re.DOTALL
)
# Checked first: "not_supported", "not supported", "not-supported", "unsupported".
_NOT_SUPPORTED_RE = re.compile(r"\b(?:not[\s_-]*supported|unsupported)\b")
_SUPPORTED_RE = re.compile(r"\bsupported\b")


def _parse_verdict(content):
    """Split a judge reply into ``(reasoning, label)``.

    The reasoning is the text inside the first reasoning tag pair ("N/A" when
    no pair is present). The label is read from the text after the last
    closing tag, or from the whole reply when there is no closing tag. Any
    negative phrasing wins, and a reply with no recognisable verdict is
    treated as ``"not_supported"``.
    """
    text = content or ""  # message content may be None

    match = _REASONING_RE.search(text)
    reasoning = match.group(1).strip() if match else "N/A"

    # rpartition yields the full text as the tail when the tag is absent.
    verdict_text = text.rpartition(f"</{REASONING_TAG}>")[2].lower()

    if _NOT_SUPPORTED_RE.search(verdict_text):
        return reasoning, "not_supported"
    if _SUPPORTED_RE.search(verdict_text):
        return reasoning, "supported"
    return reasoning, "not_supported"


def get_reasoned_verdict(reference, statement, task_type, literacy_level):
    """Ask the judge model to re-evaluate one claim.

    Args:
        reference: Text the claim is checked against.
        statement: The claim / fact under audit.
        task_type: Audit task name, see ``get_audit_prompt``.
        literacy_level: Target literacy level key.

    Returns:
        ``(reasoning, label)`` where ``label`` is ``"supported"`` or
        ``"not_supported"``. When the request fails, the result is
        ``("API Error", "not_supported")`` so a long batch run keeps going.

    Raises:
        ValueError: If ``task_type`` is unknown (raised while building the
            prompt, before any request is made).
    """
    prompt = get_audit_prompt(task_type, reference, statement, literacy_level)
    try:
        response = client.chat.completions.create(
            model=MODEL_NAME,
            messages=[{"role": "user", "content": prompt}],
            temperature=0.1,
        )
        content = response.choices[0].message.content
    except Exception:
        # Best-effort boundary: record the failure and continue the batch.
        logger.exception("Judge request failed for task %r", task_type)
        return "API Error", "not_supported"
    return _parse_verdict(content)


def _refine_items(items, reference, claim_key, task_type, level):
    """Re-audit every ``not_supported`` item in ``items`` in place."""
    pending = [item for item in items if item.get("status") == "not_supported"]
    if not pending:
        return
    if not reference:
        # Auditing against a missing reference would send "None" to the judge.
        logger.warning(
            "No reference text for %s audit at level %r; leaving %d item(s) untouched",
            task_type,
            level,
            len(pending),
        )
        return
    for item in pending:
        reasoning, label = get_reasoned_verdict(
            reference, item[claim_key], task_type, level
        )
        item.update({"reasoning": reasoning, "status": label, "refined": True})


# Score name -> accepted key spellings. The audit loop reads
# details['attribution'] while the score was looked up as 'factual_attribution',
# so both spellings are accepted.
# NOTE(review): confirm which key the evaluation file actually uses.
_SCORE_KEYS = {
    "factual_attribution": ("factual_attribution", "attribution"),
    "conciseness": ("conciseness",),
}


def _recalculate_scores(details, scores):
    """Recompute the supported-fraction scores from the refined details."""
    for candidates in _SCORE_KEYS.values():
        detail_key = next((key for key in candidates if key in details), None)
        if detail_key is None:
            continue
        # Prefer a score key that already exists so no duplicate entry appears.
        score_key = next((key for key in candidates if key in scores), candidates[0])
        items = details[detail_key]
        supported = sum(1 for item in items if item["status"] == "supported")
        scores[score_key] = supported / len(items) if items else 0


# -----------------------------
# MAIN PROCESSING
# -----------------------------
def main():
    """Parse arguments, refine the evaluation file and save the result."""
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--eval_file",
        type=str,
        default="/home/mshahidul/readctrl/data/reasoning/reasoned_updated_results_0_20.json",
    )
    parser.add_argument(
        "--source_file",
        type=str,
        default="/home/mshahidul/readctrl/data/extracting_subclaim/extracted_subclaims_syn_data_with_gs_summary_en.json",
    )
    parser.add_argument(
        "--save_path",
        type=str,
        default="/home/mshahidul/readctrl/data/reasoning/",
    )
    args = parser.parse_args()
    os.makedirs(args.save_path, exist_ok=True)

    with open(args.eval_file, "r", encoding="utf-8") as f:
        eval_data = json.load(f)
    with open(args.source_file, "r", encoding="utf-8") as f:
        source_data = {item['index']: item for item in json.load(f)}

    for doc in tqdm.tqdm(eval_data):
        idx = doc['index']
        original = source_data.get(idx, {})
        if not original:
            logger.warning("No source document found for index %r", idx)

        for level, content in doc['literacy_levels'].items():
            details = content['details']

            # 1. Audit Attribution
            _refine_items(
                details.get('attribution', []),
                original.get('fulltext'),
                'subclaim',
                "attribution",
                level,
            )

            # 2. Audit Conciseness
            _refine_items(
                details.get('conciseness', []),
                original.get('summary'),
                'subclaim',
                "conciseness",
                level,
            )

            # 3. Audit Completeness (disabled)
            # gen_text = original.get('diff_label_texts', {}).get(level, '')
            # _refine_items(details.get('completeness', []), gen_text,
            #               'source_fact', "completeness", level)

            # 4. Audit Source Coverage (disabled)
            # Compares each source fact against the generated text.
            # _refine_items(details.get('source_coverage', []), gen_text,
            #               'source_subclaim', "source_coverage", level)

            # Recalculate Scores
            _recalculate_scores(details, content.setdefault('scores', {}))

    save_path = os.path.join(
        args.save_path,
        f"REFINED_attr_concise_{os.path.basename(args.eval_file)}",
    )
    with open(save_path, "w", encoding="utf-8") as f:
        json.dump(eval_data, f, indent=2)
    print(f"Refinement complete. Saved to {save_path}")


if __name__ == "__main__":
    main()