# readctrl / code / finetune-inference / old / completeness_reasoning_v3.py
# shahidul034's picture
# Add files using upload-large-folder tool
# 9c6961c verified
import json
import sys
from openai import OpenAI
import ast,os
# ===========================
# CONFIGURATION
# ===========================
# Model identifier sent as the `model` field of every chat-completion request.
MODEL_NAME = "/home/mshahidul/readctrl_model/full_model/qwen3-32B_subclaims-completeness_resonability_check_8kCtx_v3_BF16_merged"

# Endpoint and credential handed to the OpenAI-compatible client below.
VLLM_API_URL = "http://localhost:8004/v1"
VLLM_API_KEY = "EMPTY"

# Shared client instance used by the inference function in this module.
client = OpenAI(api_key=VLLM_API_KEY, base_url=VLLM_API_URL)
# ===========================
# INFERENCE FUNCTION
# ===========================
# Exceptions that json.loads / ast.literal_eval raise on malformed input.
_PARSE_ERRORS = (ValueError, SyntaxError, TypeError, MemoryError, RecursionError)


def _build_reasonableness_prompt(
    reference_summary: str,
    generated_summary: str,
    readability_level: str,
    subclaim_text: str,
    result: int,
) -> str:
    """Render the evaluator prompt for a single subclaim."""
    return f"""
You are an impartial medical summarization evaluator.
Goal:
Decide whether the inclusion or omission of ONE specific subclaim from the reference summary is *reasonable*, given the readability level of the generated summary.
Readability Criteria:
- Easy: for non-medical readers; emphasize main story and outcomes; omit numerical data, anatomy, and test details.
- Intermediate: for general educated readers; keep main findings but simplify phrasing.
- Hard: for clinical or technical readers; maintain diagnostic accuracy and essential quantitative or anatomic content.
Judging rules:
* Base your decision strictly on what appears in the generated summary.
* If result = 0 (subclaim omitted) and the omitted detail is clearly technical or numerical for the given level, choose "reasonable".
* If result = 0 and the subclaim is essential to the main story, choose "unreasonable".
* Stay consistent between `result`, justification, and readability level.
### Inputs
Readability Level: {readability_level}
Reference Summary: {reference_summary}
Generated Summary: {generated_summary}
Subclaim: "{subclaim_text}"
Result: {result} # 1 = supported (included), 0 = omitted
### Task
Respond **only** with the following JSON object:
{{
"reasonableness": "<reasonable | partially_reasonable | unreasonable>",
"justification": "<short clear explanation>"
}}
""".strip()


def _parse_model_output(output_text):
    """Extract the JSON verdict from a raw completion, or return the text as-is."""
    # A completion may carry no text at all; there is nothing to parse then.
    if not isinstance(output_text, str):
        return output_text

    # Keep only what follows the *last* closing think tag, so a stray
    # "</think>" inside the reasoning cannot select the wrong segment.
    if "</think>" in output_text:
        output_text = output_text.rsplit("</think>", 1)[-1]

    clean_text = (
        output_text.strip().replace("```json", "").replace("```", "").strip()
    )

    # Try the whole reply first, then the outermost {...} span in case the
    # model wrapped the object in extra prose.
    candidates = [clean_text]
    start, end = clean_text.find("{"), clean_text.rfind("}")
    if 0 <= start < end:
        snippet = clean_text[start:end + 1]
        if snippet != clean_text:
            candidates.append(snippet)

    for candidate in candidates:
        # json.loads understands true/false/null; ast.literal_eval is kept
        # as a fallback for Python-style (single-quoted) dict replies.
        for parser in (json.loads, ast.literal_eval):
            try:
                return parser(candidate)
            except _PARSE_ERRORS:
                continue

    # Best effort: hand back the (post-think) text so the caller can store it.
    return output_text


def infer_reasonableness(
    reference_summary: str,
    generated_summary: str,
    readability_level: str,
    subclaim_text: str,
    result: int,
    *,
    llm_client=None,
    model_name=None,
):
    """
    Ask the chat-completion endpoint whether a subclaim's inclusion or
    omission is reasonable for the given readability level.

    Args:
        reference_summary: Reference summary the subclaim was drawn from.
        generated_summary: Summary being judged.
        readability_level: Level name interpolated into the prompt.
        subclaim_text: The single subclaim under evaluation.
        result: 1 if the subclaim is supported (included), 0 if omitted.
        llm_client: Optional client exposing ``chat.completions.create``;
            defaults to the module-level ``client``.
        model_name: Optional model identifier; defaults to ``MODEL_NAME``.

    Returns:
        The parsed reply (normally a dict with "reasonableness" and
        "justification") when the completion can be parsed; otherwise the
        raw completion text after any "</think>" prefix is removed. A
        non-string completion (e.g. None) is returned unchanged.

    Raises:
        Whatever the client raises for request/connection failures; only
        parsing failures are handled here.
    """
    prompt = _build_reasonableness_prompt(
        reference_summary,
        generated_summary,
        readability_level,
        subclaim_text,
        result,
    )

    # Globals are resolved at call time so explicit overrides never touch them.
    active_client = client if llm_client is None else llm_client
    active_model = MODEL_NAME if model_name is None else model_name

    response = active_client.chat.completions.create(
        model=active_model,
        messages=[{"role": "user", "content": prompt}],
        temperature=0.2,
        max_tokens=200,
        top_p=0.8,
    )
    return _parse_model_output(response.choices[0].message.content)
# ===========================
# MAIN EXECUTION
# ===========================
# Readability levels evaluated for every dataset item, in output order.
_READABILITY_LEVELS = ('easy', 'intermediate', 'hard')
# Default directory for result files; overridable with --save_dir.
_DEFAULT_SAVE_DIR = '/home/mshahidul/readctrl/data/completeness_resoning_result'
# A checkpoint is written whenever the result count is a multiple of this.
_CHECKPOINT_EVERY = 10


def _read_json(path):
    """Load a UTF-8 JSON file; a missing file raises FileNotFoundError."""
    with open(path, 'r', encoding='utf-8') as f:
        return json.load(f)


def _write_json_atomic(path, payload):
    """Write *payload* as JSON to *path* without ever leaving a partial file."""
    # Dump to a sibling temp file first, then swap it in: an interrupt during
    # the dump can no longer corrupt the file the next run resumes from.
    tmp_path = f'{path}.tmp'
    with open(tmp_path, 'w', encoding='utf-8') as f:
        json.dump(payload, f, indent=2, ensure_ascii=False)
    os.replace(tmp_path, path)


def _judge_item(item):
    """Build the per-level reasonableness record for one dataset item."""
    reference_summary = item['summary']
    completeness = {}
    for label in _READABILITY_LEVELS:
        generated_summary = item[f'{label}_text']
        judged = []
        for subclaim in item['metrics'][label]['completeness']['details']:
            if subclaim['label'] == 'supported':
                # Included subclaims need no model call; record a fixed verdict.
                output = {
                    'reasonableness': 'reasonable',
                    'justification': 'The subclaim is included in the generated summary, hence it is reasonable.'
                }
            else:
                # Any other label is treated as omitted (result = 0).
                output = infer_reasonableness(
                    reference_summary=reference_summary,
                    generated_summary=generated_summary,
                    readability_level=label,
                    subclaim_text=subclaim['subclaim'],
                    result=0,
                )
            judged.append({
                'subclaim': subclaim['subclaim'],
                'output': output
            })
        completeness[label] = {
            'results': judged
        }
    return {
        'id': item['id'],
        'completeness': completeness
    }


def main():
    """Score every not-yet-processed item of the input file, resuming if possible."""
    import argparse
    import tqdm

    parser = argparse.ArgumentParser()
    parser.add_argument("--data_path", type=str, required=True,
                        help="Path to the JSON file containing evaluation data.")
    parser.add_argument("--save_dir", type=str, default=_DEFAULT_SAVE_DIR,
                        help="Directory where the result JSON is written.")
    args = parser.parse_args()

    dataset = _read_json(args.data_path)

    # The result file reuses the input file's name inside the save directory.
    os.makedirs(args.save_dir, exist_ok=True)
    save_path = os.path.join(args.save_dir, os.path.basename(args.data_path))

    # Resume from a previous run when a result file already exists.
    full_results = _read_json(save_path) if os.path.exists(save_path) else []
    # Set of finished ids gives O(1) skip checks (ids are assumed hashable,
    # i.e. JSON strings or numbers).
    done_ids = {entry['id'] for entry in full_results}

    for item in tqdm.tqdm(dataset):
        if item['id'] in done_ids:
            continue
        full_results.append(_judge_item(item))
        done_ids.add(item['id'])
        if len(full_results) % _CHECKPOINT_EVERY == 0:
            _write_json_atomic(save_path, full_results)

    _write_json_atomic(save_path, full_results)


if __name__ == "__main__":
    main()