# readctrl / code / reasoning / reasoning_completeness_sourceCov.py
# Uploaded by shahidul034 via the upload-large-folder tool (rev c7a6fe6, verified).
import os
import json
import tqdm
from openai import OpenAI
# --- CONFIGURATION ---
MODEL_PATH = "Qwen/Qwen3-30B-A3B-Instruct-2507"  # model name as served by the endpoint
API_URL = "http://172.16.34.29:8004/v1"          # OpenAI-compatible local inference server
API_KEY = "EMPTY"                                # server does not check credentials

# Input files
EVAL_FILE = "/home/mshahidul/readctrl/data/reasoning/REFINED_full_details_evaluation_0_20_qwen3-32B_v2.json"
RAW_DATA_FILE = "/home/mshahidul/readctrl/data/extracting_subclaim/extracted_subclaims_syn_data_with_gs_summary_en.json"

# Output file: reuse the evaluation file's basename so runs remain traceable
file_name = os.path.basename(EVAL_FILE)
UPDATED_FILE = "/home/mshahidul/readctrl/data/reasoning/reasoned_updated_results_v2_" + file_name

# Shared API client used by the reasoning core below
client = OpenAI(base_url=API_URL, api_key=API_KEY)
# -----------------------------
# REASONING CORE
# -----------------------------
def get_clinical_reasoning(source, gold, generated, subclaim, level):
    """Ask the LLM whether omitting `subclaim` from `generated` is acceptable.

    Audits a subclaim currently labeled 'not_supported' against the target
    health-literacy level's simplification strategy.

    Args:
        source: Full source text (the original paper).
        gold: Expert reference (gold) summary.
        generated: Model-generated simplified text being evaluated.
        subclaim: The subclaim whose omission is being judged.
        level: Literacy-level key (e.g. 'low_health_literacy'); unknown keys
            fall back to a generic guideline.

    Returns:
        dict with keys 'category' ('reasonable' | 'unreasonable'), 'reason',
        and 'explanation'. On any API/parsing failure, returns a conservative
        'unreasonable' verdict rather than raising, so the caller's batch run
        is never aborted by a single bad response.
    """
    # Map the literacy-level label to its prompt guidelines.
    level_guidelines = {
        "low_health_literacy": """
- Goal: 'Living room' language; replace jargon (e.g., 'renal' -> 'kidney').
- Density: Focus ONLY on 'need-to-know' info from Gold Summary.
- Strategy: One idea per sentence.
- Reasonable Omission: Technical jargon or details NOT in the Gold Summary.
""",
        "intermediate_health_literacy": """
- Goal: Standard vocabulary; common medical terms are okay.
- Density: Gold Summary as lead + necessary Source Text context.
- Strategy: Remove minor technical details to avoid overload.
- Reasonable Omission: Minor technical nuances or physiological mechanisms.
""",
        "proficient_health_literacy": """
- Goal: Technical/Academic language; prioritize clinical nuance.
- Density: High; include data, mechanisms, and statistics from Full Source.
- Strategy: Retain all original technical terminology.
- Reasonable Omission: Almost none; should adhere closely to Full Source.
"""
    }
    guideline = level_guidelines.get(level, "Follow standard medical summarization principles.")
    prompt = f"""You are a clinical logic validator auditing medical text simplification.
A subclaim is currently labeled 'not_supported' in the generated text. Your job is to decide whether
its omission is acceptable for the target literacy level.
### Target Level Guidelines: {level}
{guideline}
### Inputs:
1) Source Text (Full Paper): {source}
2) Gold Summary (Expert Reference): {gold}
3) Generated Text (Model Output): {generated}
4) Subclaim to Evaluate: {subclaim}
### Decision rules (MUST follow):
A) First, determine whether the subclaim is present in or required by the Gold Summary.
- If the Gold Summary includes this subclaim (or an equivalent idea), then omitting it is usually UNREASONABLE
even for low health literacy, because low literacy still must retain "need-to-know" gold content.
B) Check for outcome-critical content.
- If the subclaim is about outcomes/prognosis (e.g., recovery, no sequelae, disability, death, major complications),
treat it as clinically important. Omission is UNREASONABLE unless the Gold Summary clearly omits it and
the generated text already conveys the same outcome clearly.
C) Check time scope.
- If the subclaim could apply only to a specific time window (e.g., "no sequelae after initial event"),
infer whether the generated text covers that window. If the generated text describes later deterioration/death,
do NOT assume that supports "no sequelae." If the time scope is unclear, err toward UNREASONABLE.
D) Only mark REASONABLE if:
- The subclaim is NOT in the Gold Summary (or is clearly non-essential there), AND
- It is mainly anatomical/technical detail, jargon, or minor nuance for this literacy level, AND
- Omitting it does not change the clinical interpretation.
### Output ONLY JSON:
{{
"category": "reasonable" | "unreasonable",
"reason": "jargon_reduction" | "detail_filtering" | "clinical_info_loss",
"explanation": "One sentence justification referencing Gold Summary importance and (if relevant) time/outcome."
}}
JSON:"""
    try:
        response = client.chat.completions.create(
            model=MODEL_PATH,
            messages=[{"role": "user", "content": prompt}],
            max_tokens=250,
            temperature=0.1  # near-deterministic: this is an audit, not generation
        )
        content = response.choices[0].message.content.strip()
        # Strip a markdown code fence if the model wrapped its JSON in one.
        if "```json" in content:
            content = content.split("```json")[-1].split("```")[0].strip()
        result = json.loads(content)
        # Guard against well-formed JSON that doesn't match the schema
        # (e.g. a list, or a dict missing 'category').
        if not isinstance(result, dict) or "category" not in result:
            raise ValueError("model output does not match the expected JSON schema")
        return result
    except Exception as exc:
        # Fail closed: an unverifiable omission is treated as unreasonable.
        # Narrowed from a bare `except:` so Ctrl-C / SystemExit still propagate.
        return {
            "category": "unreasonable",
            "reason": "clinical_info_loss",
            "explanation": f"API/parsing error: {exc}",
        }
# -----------------------------
# MAIN PROCESSING LOOP
# -----------------------------
def _audit_omissions(items, claim_key, source_text, gold_summary, gen_text, level):
    """Re-audit every 'not_supported' item in `items` in place and return the score.

    For each 'not_supported' item, asks the LLM whether the omission is
    acceptable for `level`; acceptable omissions are relabeled
    'reasonable_omission'. The audit verdict is stored on every audited item
    (previously it was only recorded for 'reasonable' ones, losing the trail
    for rejected omissions).

    Returns the fraction of items that are satisfied after auditing, i.e.
    items that were already supported plus reasonable omissions, over the
    total. (The original code incremented the counter for *every*
    'not_supported' item — reasonable or not — and never counted supported
    items, so the score was the not_supported fraction, not a completeness
    score.)
    """
    satisfied = 0
    for obj in items:
        if obj['status'] == 'not_supported':
            res = get_clinical_reasoning(
                source=source_text, gold=gold_summary, generated=gen_text,
                subclaim=obj[claim_key], level=level)
            obj['reasoning_audit'] = res
            if res.get('category') == 'reasonable':
                obj['status'] = 'reasonable_omission'
                satisfied += 1
        else:
            # Already supported by the generated text.
            satisfied += 1
    return satisfied / len(items) if items else 0

def process_and_update_details():
    """Re-audit 'not_supported' subclaims in the evaluation file and rescore.

    Loads the evaluation results and the raw dataset, joins them on 'index',
    audits completeness and source-coverage omissions per literacy level,
    updates statuses/scores in place, and writes the full structure to
    UPDATED_FILE.
    """
    # 1. Load datasets (raw data is keyed by 'index' for O(1) joins).
    with open(EVAL_FILE, 'r', encoding='utf-8') as f:
        eval_data = json.load(f)
    with open(RAW_DATA_FILE, 'r', encoding='utf-8') as f:
        raw_lookup = {item['index']: item for item in json.load(f)}
    # 2. Iterate over entries and their literacy levels.
    for entry in tqdm.tqdm(eval_data, desc="Updating Subclaim Details"):
        raw_item = raw_lookup.get(entry['index'])
        if not raw_item:
            # Evaluation entry has no matching raw record; skip it.
            continue
        source_text = raw_item['fulltext']
        gold_summary = raw_item['summary']
        for level, lvl_content in entry['literacy_levels'].items():
            gen_text = raw_item['diff_label_texts'].get(level, "")
            # Completeness facts are keyed 'source_fact'; coverage subclaims
            # are keyed 'source_subclaim' — same audit, different field name.
            lvl_content['scores']['completeness'] = _audit_omissions(
                lvl_content['details']['completeness'], 'source_fact',
                source_text, gold_summary, gen_text, level)
            lvl_content['scores']['source_coverage'] = _audit_omissions(
                lvl_content['details']['source_coverage'], 'source_subclaim',
                source_text, gold_summary, gen_text, level)
    # 3. Save the modified full structure.
    with open(UPDATED_FILE, 'w', encoding='utf-8') as f:
        json.dump(eval_data, f, indent=2)
    print(f"\nUpdate complete. Detailed status and scores saved to: {UPDATED_FILE}")

if __name__ == "__main__":
    process_and_update_details()