import os

# GPU selection must happen before torch/unsloth are imported (CUDA reads these
# at initialization): PCI_BUS_ID makes indices match nvidia-smi, and only
# physical GPU 4 is exposed (seen inside the process as cuda:0 / "cuda").
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "4"


import json
import torch
from unsloth import FastLanguageModel
import tqdm




# Process-wide singleton holding the loaded model/tokenizer pair; populated
# lazily by load_finetuned_model() so the weights are read from disk only once.
_model_cache = {"model": None, "tokenizer": None}
|
|
def load_finetuned_model(model_path: str):
    """Return the fine-tuned model and tokenizer, loading them at most once.

    The pair is memoized in the module-level ``_model_cache``; subsequent
    calls reuse the already-loaded weights instead of reloading from disk.
    """
    cached = _model_cache["model"]
    if cached is not None:
        return cached, _model_cache["tokenizer"]

    loaded_model, loaded_tokenizer = FastLanguageModel.from_pretrained(
        model_name=model_path,
        max_seq_length=8192,
        load_in_4bit=False,
        load_in_8bit=False,
        full_finetuning=False,
    )
    _model_cache["model"] = loaded_model
    _model_cache["tokenizer"] = loaded_tokenizer
    return loaded_model, loaded_tokenizer
|
|
|
|
def build_inference_prompt(
    reference_full_text,
    generated_summary,
    subclaim_id,
    subclaim_text,
    subclaim_result,
    difficulty_level
):
    """Build the standardized single-subclaim evaluation prompt.

    Fixes vs. the original template: the task-instruction examples had
    unbalanced quotes (`"result": 1"`), and the ```json output fence was
    never closed — both could confuse the evaluated model.

    Args:
        reference_full_text: Source document the summary was generated from.
        generated_summary: The summary whose subclaim is being judged.
        subclaim_id: Identifier of the subclaim within the summary.
        subclaim_text: The subclaim sentence itself. It is interpolated into
            a JSON-like snippet, so it should not contain unescaped double
            quotes (not validated here).
        subclaim_result: Verifier flag — 1 means supported, 0 unsupported.
        difficulty_level: One of "easy", "intermediate", "hard".

    Returns:
        The fully rendered prompt string, stripped of surrounding whitespace.
    """
    inference_prompt = f"""
### **SYSTEM / ROLE INSTRUCTION**

You are a **medical factuality and attribution evaluator**.
You will analyze one subclaim from a generated medical summary.

Each subclaim includes a `"result"` flag:
- `1` → Supported by the reference text (no reasonableness check required)
- `0` → Unsupported by the reference text (evaluate scope and validity)

Your task is to decide, for unsupported subclaims, whether the new information
is a *reasonable addition* given the specified readability level:
**easy**, **intermediate**, or **hard**.

---

### **READABILITY GUIDELINES**

| Level | Audience | Style | Allowable Additions |
| :-- | :-- | :-- | :-- |
| **Easy (FH 70–100)** | General public | Simple, concrete | Broad clarifications only; no factual innovations |
| **Intermediate (FH 50–69)** | Educated nonspecialist | Moderate precision | Limited clarifications consistent with the text |
| **Hard (FH 0–49)** | Professionals | Formal, technical | Must be strictly supported by evidence |

---

### **INPUT**

Readability Level: {difficulty_level}

Reference Full Text:
{reference_full_text}

Generated Summary:
{generated_summary}

Subclaim Info:
{{
  "subclaim_id": {subclaim_id},
  "subclaim": "{subclaim_text}",
  "result": {subclaim_result}
}}

---

### **TASK INSTRUCTIONS**

- If `"result": 1`, respond with `"not_applicable"` and justify briefly
  (e.g., *"supported, no evaluation required"*).
- If `"result": 0`, classify reasonableness:
  - `"reasonable"` → legitimate simplification consistent with the readability level
  - `"partially_reasonable"` → benign rephrasing
  - `"unreasonable"` → misleading, speculative, or contradicted by the source

Provide a **short 1–2 sentence justification**.

---

### **EXPECTED OUTPUT (JSON ONLY)**

```json
{{
  "evaluation": {{
    "subclaim_id": {subclaim_id},
    "subclaim": "{subclaim_text}",
    "result": {subclaim_result},
    "reasonableness": "<reasonable | partially_reasonable | unreasonable | not_applicable>",
    "justification": "<brief justification>"
  }}
}}
```
""".strip()

    return inference_prompt
def infer_attribution_reasonableness(prompt: str, model_path: str):
    """Run the fine-tuned model on *prompt* and return its verdict.

    Args:
        prompt: Fully rendered evaluation prompt (see build_inference_prompt).
        model_path: Path of the fine-tuned checkpoint to load (cached after
            the first call via load_finetuned_model).

    Returns:
        The parsed JSON object when the model emits valid JSON, otherwise
        the raw decoded string so callers can still persist the response.
    """
    model, tokenizer = load_finetuned_model(model_path)

    messages = [{"role": "user", "content": prompt + "\n"}]

    chat_text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
        enable_thinking=False,
    )

    inputs = tokenizer(chat_text, return_tensors="pt").to("cuda")

    with torch.no_grad():
        output_ids = model.generate(
            **inputs,
            max_new_tokens=150,
            # Greedy decoding; temperature/top_p/top_k are ignored (and warn)
            # when do_sample=False, so they are deliberately omitted.
            do_sample=False,
        )

    # Decode only the newly generated tokens — the original decoded the full
    # sequence including the echoed prompt and relied on the "</think>" split
    # to discard it.
    prompt_len = inputs["input_ids"].shape[1]
    output_text = tokenizer.decode(
        output_ids[0][prompt_len:], skip_special_tokens=True
    ).strip()

    # Drop any residual reasoning block, then strip markdown code fences
    # unconditionally (previously fences were only removed when a "</think>"
    # tag happened to be present) so the remainder is bare JSON.
    if "</think>" in output_text:
        output_text = output_text.split("</think>")[-1]
    output_text = output_text.replace("```json", "").replace("```", "").strip()

    try:
        parsed = json.loads(output_text)
    except Exception:
        parsed = output_text
    return parsed
|
|
|
|
# Inputs: synthetic dataset + prior subclaim-verifier results.
# Output: per-subclaim attribution reasonableness verdicts.
# NOTE(review): "resonability" in the paths is a long-standing typo; renaming
# it would orphan previously saved results that the resume logic below reads.
file_synth = "/home/mshahidul/readctrl/data/training_data_subclaim_verifier/synthetic_data_es_subclaims_100.json"
file_qwen_results = "/home/mshahidul/readctrl/results/dataset_quality_check/subclaim_verifier_results_100_qwen3-32B.json"
save_path = "/home/mshahidul/readctrl/results/dataset_quality_check/attribution_resonability_results_100_qwen3-32B_v2.json"


with open(file_synth, 'r') as f:
    synthetic_data = json.load(f)
with open(file_qwen_results, 'r') as f:
    qwen3_32B_results = json.load(f)
# Index verifier results by (entry id, readability version) for O(1) lookup.
dict1={}
for item in qwen3_32B_results:
    version=item['version']
    dict1[(item['id'], version)] = item['attribution']['results']

# Resume support: reload any previously saved output so re-runs skip
# (id, level) pairs that were already completed.
res = []
if os.path.exists(save_path):
    with open(save_path, 'r') as f:
        res = json.load(f)
    print(f"🔁 Resuming from {len(res)} entries")


existing = set((e["id"], e["difficulty_level"]) for e in res)


# Evaluate the first 100 synthetic entries at every readability level.
for ind in tqdm.tqdm(range(0, 100)):
    entry = synthetic_data[ind]

    for level in ["easy", "intermediate", "hard"]:
        # NOTE(review): this lookup runs before the skip check, so a missing
        # (id, level) key raises KeyError even for already-completed entries.
        subclaims_results = dict1[(entry["id"], level)]
        if (entry["id"], level) in existing:
            print(f"⏭️ Skipping {entry['id']} ({level})")
            continue


        ref_full_text = entry["full_text"]
        generated_summary = entry["readability_versions"][level]["text"]
        temp=[]
        for subclaim in subclaims_results:
            subclaim_id = subclaim['subclaim']['id']
            subclaim_text = subclaim['subclaim']['subclaim']
            subclaim_result = subclaim['result']
            prompt = build_inference_prompt(
                ref_full_text,
                generated_summary,
                subclaim_id,
                subclaim_text,
                subclaim_result,
                level
            )
            # Supported subclaims need no model call; record them directly.
            # NOTE(review): assumes 'result' is the STRING "1" — an integer 1
            # would fall through to the model; confirm against the data file.
            if subclaim_result=="1":
                temp.append({
                    "subclaim_id": subclaim_id,
                    "subclaim_text": subclaim_text,
                    "response": "not_applicable"
                })
                continue
            response = infer_attribution_reasonableness(prompt,"/home/mshahidul/readctrl_model/qwen3-32B_subclaims-attribution_resonability_check_8kCtx_v1")
            temp.append({
                "subclaim_id": subclaim_id,
                "subclaim_text": subclaim_text,
                "response": response
            })
        res.append({
            "id": entry["id"],
            "difficulty_level": level,
            "results": temp
        })
        # Checkpoint every 10 completed (entry, level) pairs so a crash
        # loses at most 10 results.
        if len(res) % 10 == 0:
            with open(save_path, 'w') as f:
                json.dump(res, f, indent=2, ensure_ascii=False)
            print(f"💾 Saved after {len(res)} entries")


# Final write so the last partial batch (len(res) % 10 != 0) is persisted.
with open(save_path, 'w') as f:
    json.dump(res, f, indent=2, ensure_ascii=False)
|
|
|
|
|
|