readctrl / code /finetune-inference /old /completeness_reasoning_v3.py

Add files using upload-large-folder tool

9c6961c verified 28 days ago

6.2 kB

	import json
	import sys
	from openai import OpenAI
	import ast,os
	# ===========================
	# CONFIGURATION
	# ===========================
	# Every setting below may be overridden through the environment variable named
	# in its os.environ.get() call; when the variable is unset the original
	# hard-coded value is used, so existing invocations behave exactly as before.
	MODEL_NAME = os.environ.get(
	    "VLLM_MODEL_NAME",
	    "/home/mshahidul/readctrl_model/full_model/qwen3-32B_subclaims-completeness_resonability_check_8kCtx_v3_BF16_merged",
	)
	VLLM_API_URL = os.environ.get("VLLM_API_URL", "http://localhost:8004/v1")
	VLLM_API_KEY = os.environ.get("VLLM_API_KEY", "EMPTY")

	# Module-level client shared by every call to infer_reasonableness().
	client = OpenAI(
	    base_url=VLLM_API_URL,
	    api_key=VLLM_API_KEY,
	)

	# ===========================
	# INFERENCE FUNCTION
	# ===========================
	def _parse_or_passthrough(answer_text):
	    """Decode the model's answer, falling back to the unparsed text.

	    Markdown code fences are removed, then the text is decoded as JSON (the
	    format the prompt asks for). If that fails, ``ast.literal_eval`` is tried
	    so that Python-style literals (single quotes, ``True``/``None``) are still
	    accepted. When neither parser succeeds, *answer_text* is returned as-is.
	    """
	    cleaned = answer_text.strip().replace("```json", "").replace("```", "").strip()
	    for parser in (json.loads, ast.literal_eval):
	        try:
	            return parser(cleaned)
	        except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
	            # json.JSONDecodeError is a ValueError; the remaining types are the
	            # ones ast.literal_eval is documented to raise on malformed input.
	            continue
	    return answer_text


	def infer_reasonableness(
	    reference_summary: str,
	    generated_summary: str,
	    readability_level: str,
	    subclaim_text: str,
	    result: int,
	):
	    """
	    Predict reasonableness using the local vLLM server.

	    Args:
	        reference_summary: Reference summary the subclaim comes from.
	        generated_summary: Generated summary being judged.
	        readability_level: Readability level inserted into the prompt.
	        subclaim_text: The single subclaim under evaluation.
	        result: 1 if the subclaim is supported (included), 0 if omitted.

	    Returns:
	        The decoded response (a dict with "reasonableness" and
	        "justification" when the model follows the prompt). If the response
	        cannot be decoded, the response text with any "<think>" section
	        removed is returned instead. If the server returns no text content,
	        that value is returned unchanged.

	    Raises:
	        Request and connection errors from the client are not caught here.
	    """

	    # ---- Build inference prompt ----
	    # NOTE(review): the "\\|" sequences below put a literal backslash before
	    # each pipe in the prompt text, exactly as the previous "\|" spelling did
	    # (that spelling is an invalid escape sequence and triggers a warning on
	    # recent Python versions). Confirm the backslashes are intended.
	    prompt = f"""
	You are an impartial medical summarization evaluator.

	Goal:
	Decide whether the inclusion or omission of ONE specific subclaim from the reference summary is reasonable, given the readability level of the generated summary.

	Readability Criteria:
	- Easy: for non-medical readers; emphasize main story and outcomes; omit numerical data, anatomy, and test details.
	- Intermediate: for general educated readers; keep main findings but simplify phrasing.
	- Hard: for clinical or technical readers; maintain diagnostic accuracy and essential quantitative or anatomic content.

	Judging rules:
	* Base your decision strictly on what appears in the generated summary.
	* If result = 0 (subclaim omitted) and the omitted detail is clearly technical or numerical for the given level, choose "reasonable".
	* If result = 0 and the subclaim is essential to the main story, choose "unreasonable".
	* Stay consistent between `result`, justification, and readability level.

	### Inputs
	Readability Level: {readability_level}
	Reference Summary: {reference_summary}
	Generated Summary: {generated_summary}
	Subclaim: "{subclaim_text}"
	Result: {result} # 1 = supported (included), 0 = omitted

	### Task
	Respond only with the following JSON object:

	{{
	"reasonableness": "<reasonable \\| partially_reasonable \\| unreasonable>",
	"justification": "<short clear explanation>"
	}}
	""".strip()

	    messages = [{"role": "user", "content": prompt}]

	    # ---- Call vLLM Server ----
	    response = client.chat.completions.create(
	        model=MODEL_NAME,
	        messages=messages,
	        temperature=0.2,
	        max_tokens=200,
	        top_p=0.8,
	    )

	    output_text = response.choices[0].message.content
	    if not isinstance(output_text, str):
	        # No text content to clean or parse; hand the value back untouched.
	        return output_text

	    # ---- Clean Output (Handle Thinking & Markdown) ----
	    # Keep only what follows the LAST closing tag, so a stray "</think>"
	    # inside the reasoning itself cannot be mistaken for the answer boundary.
	    if "</think>" in output_text:
	        output_text = output_text.rsplit("</think>", 1)[1]

	    # ---- Parse JSON (falls back to the cleaned text if it cannot be decoded) ----
	    return _parse_or_passthrough(output_text)


	# ===========================
	# MAIN EXECUTION
	# ===========================
	def _write_json_atomic(path, payload):
	    """Write *payload* to *path* as UTF-8 JSON via a temporary sibling file.

	    The data is first written to ``<path>.tmp`` and then moved over *path*
	    with ``os.replace``, so an interruption during the write leaves the
	    previous checkpoint intact instead of a truncated file.
	    """
	    tmp_path = f"{path}.tmp"
	    with open(tmp_path, 'w', encoding='utf-8') as f:
	        json.dump(payload, f, indent=2, ensure_ascii=False)
	    os.replace(tmp_path, path)


	if __name__ == "__main__":
	    import argparse

	    import tqdm

	    parser = argparse.ArgumentParser()
	    parser.add_argument("--data_path", type=str, required=True,
	                        help="Path to the JSON file containing evaluation data.")
	    args = parser.parse_args()
	    data_path = args.data_path
	    # The results file reuses the input file's base name.
	    file_name = os.path.basename(data_path)

	    # Open file directly (Will raise FileNotFoundError if missing).
	    # UTF-8 is set explicitly because the results are dumped with
	    # ensure_ascii=False and must not depend on the locale's default encoding.
	    with open(data_path, 'r', encoding='utf-8') as f:
	        dataset = json.load(f)

	    save_path = f'/home/mshahidul/readctrl/data/completeness_resoning_result/{file_name}'
	    os.makedirs(os.path.dirname(save_path), exist_ok=True)

	    # Resume support: reload results saved by an earlier run, if any.
	    full_results = []
	    if os.path.exists(save_path):
	        with open(save_path, 'r', encoding='utf-8') as f:
	            full_results = json.load(f)

	    # Ids already present in the results, kept as a set so the skip check is
	    # O(1) per item instead of a scan over every stored result.
	    # Assumes ids are hashable JSON scalars (strings or numbers).
	    done_ids = {entry['id'] for entry in full_results}

	    for item in tqdm.tqdm(dataset):
	        if item['id'] in done_ids:
	            continue

	        reference_summary = item['summary']
	        per_level = {}
	        for label in ['easy', 'intermediate', 'hard']:
	            generated_summary = item[f'{label}_text']
	            subclaim_list = item['metrics'][f'{label}']['completeness']['details']

	            level_results = []
	            for subclaim in subclaim_list:
	                # Each entry carries a 'label' status and the 'subclaim' text.
	                if subclaim['label'] == 'supported':
	                    # Included subclaims are recorded as reasonable without
	                    # querying the model.
	                    output = {
	                        'reasonableness': 'reasonable',
	                        'justification': 'The subclaim is included in the generated summary, hence it is reasonable.'
	                    }
	                else:
	                    # Anything not labelled 'supported' is treated as omitted
	                    # (result = 0) and judged by the model.
	                    output = infer_reasonableness(
	                        reference_summary=reference_summary,
	                        generated_summary=generated_summary,
	                        readability_level=label,
	                        subclaim_text=subclaim['subclaim'],
	                        result=0,
	                    )

	                level_results.append({
	                    'subclaim': subclaim['subclaim'],
	                    'output': output
	                })

	            per_level[label] = {
	                'results': level_results
	            }

	        full_results.append({
	            'id': item['id'],
	            'completeness': per_level
	        })
	        done_ids.add(item['id'])

	        # Checkpoint whenever the result count reaches a multiple of 10.
	        if len(full_results) % 10 == 0:
	            _write_json_atomic(save_path, full_results)

	    # Final save covers the items processed since the last checkpoint.
	    _write_json_atomic(save_path, full_results)