| | import os |
| | import json |
| | import tqdm |
| | from openai import OpenAI |
| |
|
| | |
# --- API / model configuration -------------------------------------------
# Model name as exposed by the OpenAI-compatible server below.
MODEL_PATH = "Qwen/Qwen3-30B-A3B-Instruct-2507"
# NOTE(review): hard-coded internal IP of the inference server — confirm it
# is reachable from where this script runs.
API_URL = "http://172.16.34.29:8004/v1"
# Server does not check keys; any placeholder value works.
API_KEY = "EMPTY"

# --- Input / output paths -------------------------------------------------
# Evaluation results to audit (per-entry, per-literacy-level subclaim details).
EVAL_FILE = "/home/mshahidul/readctrl/data/reasoning/REFINED_full_details_evaluation_0_20_qwen3-32B_v2.json"
# Raw items keyed by 'index'; supplies fulltext, gold summary, and the
# per-level generated texts that the evaluation refers to.
RAW_DATA_FILE = "/home/mshahidul/readctrl/data/extracting_subclaim/extracted_subclaims_syn_data_with_gs_summary_en.json"

# Output file name is derived from the evaluation file's basename.
file_name=os.path.basename(EVAL_FILE)
UPDATED_FILE = f"/home/mshahidul/readctrl/data/reasoning/reasoned_updated_results_v2_{file_name}"

# Single OpenAI-compatible client reused for every request.
client = OpenAI(base_url=API_URL, api_key=API_KEY)
| |
|
| | |
| | |
| | |
def get_clinical_reasoning(source, gold, generated, subclaim, level):
    """Ask the LLM whether omitting *subclaim* from *generated* is acceptable.

    Args:
        source: Full source text (the original paper).
        gold: Expert reference (gold) summary.
        generated: Model-generated summary for the target literacy level.
        subclaim: The subclaim currently labeled 'not_supported'.
        level: Literacy level key (e.g. "low_health_literacy").

    Returns:
        dict with keys "category" ("reasonable" | "unreasonable"), "reason",
        and "explanation". On any API/parsing failure a conservative
        fallback ({"category": "unreasonable", ...}) is returned so the
        caller never crashes and omissions are not silently excused.
    """
    level_guidelines = {
        "low_health_literacy": """
- Goal: 'Living room' language; replace jargon (e.g., 'renal' -> 'kidney').
- Density: Focus ONLY on 'need-to-know' info from Gold Summary.
- Strategy: One idea per sentence.
- Reasonable Omission: Technical jargon or details NOT in the Gold Summary.
""",
        "intermediate_health_literacy": """
- Goal: Standard vocabulary; common medical terms are okay.
- Density: Gold Summary as lead + necessary Source Text context.
- Strategy: Remove minor technical details to avoid overload.
- Reasonable Omission: Minor technical nuances or physiological mechanisms.
""",
        "proficient_health_literacy": """
- Goal: Technical/Academic language; prioritize clinical nuance.
- Density: High; include data, mechanisms, and statistics from Full Source.
- Strategy: Retain all original technical terminology.
- Reasonable Omission: Almost none; should adhere closely to Full Source.
""",
    }

    # Unknown levels fall back to a generic instruction rather than failing.
    guideline = level_guidelines.get(level, "Follow standard medical summarization principles.")

    prompt = f"""You are a clinical logic validator auditing medical text simplification.

A subclaim is currently labeled 'not_supported' in the generated text. Your job is to decide whether
its omission is acceptable for the target literacy level.

### Target Level Guidelines: {level}
{guideline}

### Inputs:
1) Source Text (Full Paper): {source}
2) Gold Summary (Expert Reference): {gold}
3) Generated Text (Model Output): {generated}
4) Subclaim to Evaluate: {subclaim}

### Decision rules (MUST follow):
A) First, determine whether the subclaim is present in or required by the Gold Summary.
- If the Gold Summary includes this subclaim (or an equivalent idea), then omitting it is usually UNREASONABLE
even for low health literacy, because low literacy still must retain "need-to-know" gold content.
B) Check for outcome-critical content.
- If the subclaim is about outcomes/prognosis (e.g., recovery, no sequelae, disability, death, major complications),
treat it as clinically important. Omission is UNREASONABLE unless the Gold Summary clearly omits it and
the generated text already conveys the same outcome clearly.
C) Check time scope.
- If the subclaim could apply only to a specific time window (e.g., "no sequelae after initial event"),
infer whether the generated text covers that window. If the generated text describes later deterioration/death,
do NOT assume that supports "no sequelae." If the time scope is unclear, err toward UNREASONABLE.
D) Only mark REASONABLE if:
- The subclaim is NOT in the Gold Summary (or is clearly non-essential there), AND
- It is mainly anatomical/technical detail, jargon, or minor nuance for this literacy level, AND
- Omitting it does not change the clinical interpretation.

### Output ONLY JSON:
{{
"category": "reasonable" | "unreasonable",
"reason": "jargon_reduction" | "detail_filtering" | "clinical_info_loss",
"explanation": "One sentence justification referencing Gold Summary importance and (if relevant) time/outcome."
}}
JSON:"""
    try:
        response = client.chat.completions.create(
            model=MODEL_PATH,
            messages=[{"role": "user", "content": prompt}],
            max_tokens=250,
            temperature=0.1,
        )
        content = response.choices[0].message.content.strip()
        # Strip a markdown code fence if the model wrapped its JSON in one
        # (handles both ```json ... ``` and bare ``` ... ```).
        if "```json" in content:
            content = content.split("```json")[-1].split("```")[0].strip()
        elif "```" in content:
            content = content.split("```")[1].strip()
        return json.loads(content)
    except Exception:
        # Boundary catch (was a bare `except:`, which also swallowed
        # KeyboardInterrupt/SystemExit). Fail conservatively: an omission we
        # could not audit is treated as unreasonable. Include "reason" so the
        # fallback matches the documented output schema.
        return {
            "category": "unreasonable",
            "reason": "clinical_info_loss",
            "explanation": "API parsing error",
        }
| |
|
| | |
| | |
| | |
def _audit_omissions(items, subclaim_key, source_text, gold_summary, gen_text, level):
    """Audit every 'not_supported' item in *items* via the LLM and return the
    corrected score: the fraction of items whose final status is acceptable
    ('supported' or 'reasonable_omission'). Mutates *items* in place, storing
    the audit verdict under 'reasoning_audit' for each audited item.
    """
    for obj in items:
        if obj['status'] != 'not_supported':
            continue
        res = get_clinical_reasoning(
            source=source_text,
            gold=gold_summary,
            generated=gen_text,
            subclaim=obj[subclaim_key],
            level=level,
        )
        if res.get('category') == 'reasonable':
            obj['status'] = 'reasonable_omission'
        # Record the verdict even for rejections (the original only kept the
        # audit trail for accepted omissions).
        obj['reasoning_audit'] = res
    # BUG FIX: the original incremented the counter in BOTH audit branches,
    # so every 'not_supported' item counted toward the score regardless of
    # the verdict, while already-'supported' items were never counted
    # (an all-supported summary scored 0). Score now = acceptable fraction.
    acceptable = sum(
        1 for obj in items if obj['status'] in ('supported', 'reasonable_omission')
    )
    return acceptable / len(items) if items else 0


def process_and_update_details():
    """Re-audit 'not_supported' subclaims in EVAL_FILE against their raw
    items, upgrade level-appropriate omissions to 'reasonable_omission',
    recompute completeness/source_coverage scores, and write UPDATED_FILE.
    """
    with open(EVAL_FILE, 'r', encoding='utf-8') as f:
        eval_data = json.load(f)
    with open(RAW_DATA_FILE, 'r', encoding='utf-8') as f:
        # Raw items are matched to evaluation entries by their 'index' field.
        raw_lookup = {item['index']: item for item in json.load(f)}

    for entry in tqdm.tqdm(eval_data, desc="Updating Subclaim Details"):
        raw_item = raw_lookup.get(entry['index'])
        if not raw_item:
            # No raw counterpart: nothing to audit against.
            continue

        source_text = raw_item['fulltext']
        gold_summary = raw_item['summary']

        for level, lvl_content in entry['literacy_levels'].items():
            gen_text = raw_item['diff_label_texts'].get(level, "")
            details = lvl_content['details']
            scores = lvl_content['scores']

            # The two metrics differ only in which list they audit and the
            # key holding the subclaim text.
            scores['completeness'] = _audit_omissions(
                details['completeness'], 'source_fact',
                source_text, gold_summary, gen_text, level,
            )
            scores['source_coverage'] = _audit_omissions(
                details['source_coverage'], 'source_subclaim',
                source_text, gold_summary, gen_text, level,
            )

    with open(UPDATED_FILE, 'w', encoding='utf-8') as f:
        json.dump(eval_data, f, indent=2)
    print(f"\nUpdate complete. Detailed status and scores saved to: {UPDATED_FILE}")
| |
|
| | if __name__ == "__main__": |
| | process_and_update_details() |