import os
import json
import tqdm
from openai import OpenAI
# --- CONFIGURATION ---
# OpenAI-compatible endpoint (e.g. vLLM) serving the judge model.
MODEL_PATH = "Qwen/Qwen3-30B-A3B-Instruct-2507"
API_URL = "http://172.16.34.29:8004/v1"
API_KEY = "EMPTY"  # local servers typically ignore the key

# Input files: per-subclaim evaluation results and the raw source/summary data.
EVAL_FILE = (
    "/home/mshahidul/readctrl/data/reasoning/"
    "REFINED_full_details_evaluation_0_20_qwen3-32B_v2.json"
)
RAW_DATA_FILE = (
    "/home/mshahidul/readctrl/data/extracting_subclaim/"
    "extracted_subclaims_syn_data_with_gs_summary_en.json"
)

# Output file: named after the evaluation file it was derived from.
file_name = os.path.basename(EVAL_FILE)
UPDATED_FILE = (
    "/home/mshahidul/readctrl/data/reasoning/reasoned_updated_results_v2_"
    + file_name
)

client = OpenAI(base_url=API_URL, api_key=API_KEY)
# -----------------------------
# REASONING CORE
# -----------------------------
def get_clinical_reasoning(source, gold, generated, subclaim, level):
    """Ask the judge LLM whether omitting *subclaim* is acceptable for *level*.

    Builds a literacy-level-aware audit prompt and requests a JSON verdict on
    a subclaim currently labeled 'not_supported' in the generated text.

    Args:
        source: Full source paper text.
        gold: Expert reference (gold) summary.
        generated: Model-generated summary being audited.
        subclaim: The subclaim whose omission is being judged.
        level: Target literacy level key (e.g. 'low_health_literacy').

    Returns:
        dict with keys 'category' ('reasonable' | 'unreasonable'), 'reason',
        and 'explanation'. On any API or JSON-parsing failure a conservative
        'unreasonable' fallback dict (with the same keys) is returned so the
        caller never crashes mid-run.
    """
    # Per-level omission guidelines injected verbatim into the prompt.
    level_guidelines = {
        "low_health_literacy": """
- Goal: 'Living room' language; replace jargon (e.g., 'renal' -> 'kidney').
- Density: Focus ONLY on 'need-to-know' info from Gold Summary.
- Strategy: One idea per sentence.
- Reasonable Omission: Technical jargon or details NOT in the Gold Summary.
""",
        "intermediate_health_literacy": """
- Goal: Standard vocabulary; common medical terms are okay.
- Density: Gold Summary as lead + necessary Source Text context.
- Strategy: Remove minor technical details to avoid overload.
- Reasonable Omission: Minor technical nuances or physiological mechanisms.
""",
        "proficient_health_literacy": """
- Goal: Technical/Academic language; prioritize clinical nuance.
- Density: High; include data, mechanisms, and statistics from Full Source.
- Strategy: Retain all original technical terminology.
- Reasonable Omission: Almost none; should adhere closely to Full Source.
""",
    }
    guideline = level_guidelines.get(level, "Follow standard medical summarization principles.")

    prompt = f"""You are a clinical logic validator auditing medical text simplification.
A subclaim is currently labeled 'not_supported' in the generated text. Your job is to decide whether
its omission is acceptable for the target literacy level.
### Target Level Guidelines: {level}
{guideline}
### Inputs:
1) Source Text (Full Paper): {source}
2) Gold Summary (Expert Reference): {gold}
3) Generated Text (Model Output): {generated}
4) Subclaim to Evaluate: {subclaim}
### Decision rules (MUST follow):
A) First, determine whether the subclaim is present in or required by the Gold Summary.
- If the Gold Summary includes this subclaim (or an equivalent idea), then omitting it is usually UNREASONABLE
even for low health literacy, because low literacy still must retain "need-to-know" gold content.
B) Check for outcome-critical content.
- If the subclaim is about outcomes/prognosis (e.g., recovery, no sequelae, disability, death, major complications),
treat it as clinically important. Omission is UNREASONABLE unless the Gold Summary clearly omits it and
the generated text already conveys the same outcome clearly.
C) Check time scope.
- If the subclaim could apply only to a specific time window (e.g., "no sequelae after initial event"),
infer whether the generated text covers that window. If the generated text describes later deterioration/death,
do NOT assume that supports "no sequelae." If the time scope is unclear, err toward UNREASONABLE.
D) Only mark REASONABLE if:
- The subclaim is NOT in the Gold Summary (or is clearly non-essential there), AND
- It is mainly anatomical/technical detail, jargon, or minor nuance for this literacy level, AND
- Omitting it does not change the clinical interpretation.
### Output ONLY JSON:
{{
"category": "reasonable" | "unreasonable",
"reason": "jargon_reduction" | "detail_filtering" | "clinical_info_loss",
"explanation": "One sentence justification referencing Gold Summary importance and (if relevant) time/outcome."
}}
JSON:"""
    try:
        response = client.chat.completions.create(
            model=MODEL_PATH,
            messages=[{"role": "user", "content": prompt}],
            max_tokens=250,
            temperature=0.1,
        )
        content = response.choices[0].message.content.strip()
        # Strip a markdown code fence if the model wrapped its JSON in one.
        if "```json" in content:
            content = content.split("```json")[-1].split("```")[0].strip()
        return json.loads(content)
    except Exception:
        # Was a bare `except:`, which also swallowed KeyboardInterrupt/SystemExit.
        # Fallback now includes 'reason' so the schema matches the success path.
        return {
            "category": "unreasonable",
            "reason": "clinical_info_loss",
            "explanation": "API parsing error",
        }
# -----------------------------
# MAIN PROCESSING LOOP
# -----------------------------
def process_and_update_details():
# 1. Load Datasets
with open(EVAL_FILE, 'r') as f:
eval_data = json.load(f)
with open(RAW_DATA_FILE, 'r') as f:
raw_lookup = {item['index']: item for item in json.load(f)}
# 2. Iterate through index and literacy levels
for entry in tqdm.tqdm(eval_data, desc="Updating Subclaim Details"):
idx = entry['index']
raw_item = raw_lookup.get(idx)
if not raw_item: continue
source_text = raw_item['fulltext']
gold_summary = raw_item['summary']
for level, lvl_content in entry['literacy_levels'].items():
gen_text = raw_item['diff_label_texts'].get(level, "")
# --- UPDATE COMPLETENESS DETAILS ---
comp_list = lvl_content['details']['completeness']
comp_corrected = 0
for fact_obj in comp_list:
if fact_obj['status'] == 'not_supported':
res = get_clinical_reasoning(source=source_text, gold=gold_summary, generated=gen_text, subclaim=fact_obj['source_fact'], level=level)
# Update status and add reasoning metadata
if res['category'] == 'reasonable':
fact_obj['status'] = 'reasonable_omission'
comp_corrected += 1
fact_obj['reasoning_audit'] = res
else:
comp_corrected += 1
lvl_content['scores']['completeness'] = comp_corrected / len(comp_list) if comp_list else 0
# --- UPDATE SOURCE COVERAGE DETAILS ---
sc_list = lvl_content['details']['source_coverage']
sc_corrected = 0
for sc_obj in sc_list:
if sc_obj['status'] == 'not_supported':
res = get_clinical_reasoning(source=source_text, gold=gold_summary, generated=gen_text, subclaim=sc_obj['source_subclaim'], level=level)
# Update status and add reasoning metadata
if res['category'] == 'reasonable':
sc_obj['status'] = 'reasonable_omission'
sc_corrected += 1
sc_obj['reasoning_audit'] = res
else:
sc_corrected += 1
lvl_content['scores']['source_coverage'] = sc_corrected / len(sc_list) if sc_list else 0
# 3. Save the modified full structure
with open(UPDATED_FILE, 'w') as f:
json.dump(eval_data, f, indent=2)
print(f"\nUpdate complete. Detailed status and scores saved to: {UPDATED_FILE}")
# Script entry point: run the full audit-and-update pipeline.
if __name__ == "__main__":
    process_and_update_details()