# File size: 6,196 Bytes | 9c6961c
import json
import sys
from openai import OpenAI
import ast,os
# ===========================
# CONFIGURATION
# ===========================
# Connection settings for the OpenAI-compatible inference server.
# The defaults are the original hard-coded values; each one can be overridden
# through an environment variable so the script can run against another
# checkpoint or server without editing the source.
# MODEL_NAME is sent as the `model` field of every chat-completion request.
MODEL_NAME = os.environ.get(
    "READCTRL_MODEL_NAME",
    "/home/mshahidul/readctrl_model/full_model/qwen3-32B_subclaims-completeness_resonability_check_8kCtx_v3_BF16_merged",
)
VLLM_API_URL = os.environ.get("VLLM_API_URL", "http://localhost:8004/v1")
# Placeholder key; presumably the local server does not enforce authentication
# (confirm against the server launch configuration).
VLLM_API_KEY = os.environ.get("VLLM_API_KEY", "EMPTY")

# Initialize Client
client = OpenAI(
    base_url=VLLM_API_URL,
    api_key=VLLM_API_KEY,
)
# ===========================
# INFERENCE FUNCTION
# ===========================
def _parse_model_output(output_text):
    """Best-effort conversion of a raw model reply into a Python object.

    The reply may be preceded by a ``<think>...</think>`` reasoning section
    and may be wrapped in Markdown code fences; both are removed before
    parsing. Parsing is attempted as JSON first (the prompt requests JSON),
    then as a Python literal (covers single-quoted dict replies). If the
    whole reply cannot be parsed, the span from the first ``{`` to the last
    ``}`` is tried as well, so an object surrounded by extra prose is still
    recovered.

    Returns the parsed object, or the reply text (after removal of the
    reasoning section) when nothing could be parsed. ``None`` is returned
    unchanged.
    """
    if output_text is None:
        return output_text

    # Keep only what follows the last closing tag so that several reasoning
    # sections (or a stray extra tag) cannot hide the final answer.
    if "</think>" in output_text:
        output_text = output_text.rsplit("</think>", 1)[-1]

    clean_text = output_text.strip().replace("```json", "").replace("```", "").strip()

    candidates = [clean_text]
    start = clean_text.find("{")
    end = clean_text.rfind("}")
    if 0 <= start < end:
        embedded = clean_text[start:end + 1]
        if embedded != clean_text:
            candidates.append(embedded)

    for candidate in candidates:
        for parse in (json.loads, ast.literal_eval):
            try:
                return parse(candidate)
            except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
                # Not parseable in this form; try the next parser/candidate.
                continue

    # Deliberate best-effort fallback: hand the text back to the caller
    # instead of failing the whole run on one malformed reply.
    return output_text


def infer_reasonableness(
    reference_summary: str,
    generated_summary: str,
    readability_level: str,
    subclaim_text: str,
    result: int,
):
    """
    Predict reasonableness using the local vLLM server.

    Builds a single-turn prompt asking whether including or omitting one
    subclaim is reasonable for the given readability level, sends it to the
    module-level ``client`` with ``MODEL_NAME``, and parses the reply.

    Args:
        reference_summary: Reference summary the subclaim was taken from.
        generated_summary: Generated summary being judged.
        readability_level: Readability level inserted into the prompt
            (the prompt describes Easy, Intermediate and Hard).
        subclaim_text: The single subclaim under evaluation.
        result: 1 if the subclaim is supported (included), 0 if omitted.

    Returns:
        The parsed reply, normally a dict with the keys ``reasonableness``
        and ``justification`` requested by the prompt. If the reply cannot
        be parsed, the reply text is returned instead (with any reasoning
        section removed).

    Raises:
        Connection or API errors from the client call are not caught and
        propagate to the caller.
    """
    # ---- Build inference prompt ----
    # NOTE(review): the model name suggests a fine-tuned checkpoint, so the
    # prompt wording/whitespace presumably has to match the training format
    # -- confirm before editing the text below.
    prompt = f"""
You are an impartial medical summarization evaluator.
Goal:
Decide whether the inclusion or omission of ONE specific subclaim from the reference summary is *reasonable*, given the readability level of the generated summary.
Readability Criteria:
- Easy: for non-medical readers; emphasize main story and outcomes; omit numerical data, anatomy, and test details.
- Intermediate: for general educated readers; keep main findings but simplify phrasing.
- Hard: for clinical or technical readers; maintain diagnostic accuracy and essential quantitative or anatomic content.
Judging rules:
* Base your decision strictly on what appears in the generated summary.
* If result = 0 (subclaim omitted) and the omitted detail is clearly technical or numerical for the given level, choose "reasonable".
* If result = 0 and the subclaim is essential to the main story, choose "unreasonable".
* Stay consistent between `result`, justification, and readability level.
### Inputs
Readability Level: {readability_level}
Reference Summary: {reference_summary}
Generated Summary: {generated_summary}
Subclaim: "{subclaim_text}"
Result: {result} # 1 = supported (included), 0 = omitted
### Task
Respond **only** with the following JSON object:
{{
"reasonableness": "<reasonable | partially_reasonable | unreasonable>",
"justification": "<short clear explanation>"
}}
""".strip()
    messages = [{"role": "user", "content": prompt}]

    # ---- Call vLLM Server ----
    response = client.chat.completions.create(
        model=MODEL_NAME,
        messages=messages,
        temperature=0.2,
        max_tokens=200,
        top_p=0.8,
    )
    output_text = response.choices[0].message.content

    # ---- Clean Output (Handle Thinking & Markdown) and parse ----
    return _parse_model_output(output_text)
# ===========================
# MAIN EXECUTION
# ===========================
def _load_json(path):
    """Read and return the JSON document stored at *path* (UTF-8)."""
    with open(path, 'r', encoding='utf-8') as f:
        return json.load(f)


def _save_results(results, save_path):
    """Write *results* to *save_path* as JSON without risking a partial file.

    The data is written to a sibling temporary file and then moved over the
    target, so an interruption during a checkpoint cannot leave a truncated
    file that would break resuming on the next run.
    """
    tmp_path = f'{save_path}.tmp'
    with open(tmp_path, 'w', encoding='utf-8') as f:
        json.dump(results, f, indent=2, ensure_ascii=False)
    os.replace(tmp_path, save_path)


def _evaluate_item(item):
    """Judge every completeness subclaim of one dataset item.

    Reads ``item['summary']``, ``item['<level>_text']`` and
    ``item['metrics'][<level>]['completeness']['details']`` for the levels
    easy, intermediate and hard. Each detail entry is a dict with a 'label'
    and a 'subclaim' key. Subclaims labelled 'supported' receive a fixed
    "reasonable" verdict; all others are sent to ``infer_reasonableness``
    with ``result=0``.

    Returns a dict with the item's 'id' and the per-level results under
    'completeness'.
    """
    reference_summary = item['summary']
    completeness = {}
    for label in ['easy', 'intermediate', 'hard']:
        generated_summary = item[f'{label}_text']
        subclaim_list = item['metrics'][f'{label}']['completeness']['details']
        level_results = []
        for subclaim in subclaim_list:
            if subclaim['label'] == 'supported':
                # Included subclaims need no model call.
                output = {
                    'reasonableness': 'reasonable',
                    'justification': 'The subclaim is included in the generated summary, hence it is reasonable.'
                }
            else:
                output = infer_reasonableness(
                    reference_summary=reference_summary,
                    generated_summary=generated_summary,
                    readability_level=label,
                    subclaim_text=subclaim['subclaim'],
                    result=0,
                )
            level_results.append({
                'subclaim': subclaim['subclaim'],
                'output': output
            })
        completeness[label] = {
            'results': level_results
        }
    return {
        'id': item['id'],
        'completeness': completeness
    }


def main():
    """Run the reasonableness check over a dataset file, resuming if possible.

    Results are written to ``<save_dir>/<basename of --data_path>``. If that
    file already exists, items whose id is already present are skipped. The
    file is checkpointed whenever the number of collected results is a
    multiple of 10, and once more at the end.
    """
    import argparse
    import tqdm

    parser = argparse.ArgumentParser()
    parser.add_argument("--data_path", type=str, required=True,
                        help="Path to the JSON file containing evaluation data.")
    parser.add_argument("--save_dir", type=str,
                        default='/home/mshahidul/readctrl/data/completeness_resoning_result',
                        help="Directory where the result file is written.")
    args = parser.parse_args()

    data_path = args.data_path
    file_name = os.path.basename(data_path)

    # Open file directly (will raise FileNotFoundError if missing)
    dataset = _load_json(data_path)

    # Create the output directory up front so the first checkpoint cannot
    # fail after model calls have already been spent.
    os.makedirs(args.save_dir, exist_ok=True)
    save_path = os.path.join(args.save_dir, file_name)

    full_results = []
    if os.path.exists(save_path):
        full_results = _load_json(save_path)

    # Set of finished ids gives O(1) resume checks (assumes ids are hashable,
    # e.g. int or str -- verify against the dataset).
    done_ids = {entry['id'] for entry in full_results}

    for item in tqdm.tqdm(dataset):
        if item['id'] in done_ids:
            continue
        full_results.append(_evaluate_item(item))
        done_ids.add(item['id'])
        if len(full_results) % 10 == 0:
            _save_results(full_results, save_path)

    _save_results(full_results, save_path)


if __name__ == "__main__":
    main()