MedLayEval / inference_example.py

Upload folder using huggingface_hub

67d9005 verified 8 days ago

4.01 kB

	"""Minimal one-sample MedLayEval inference example.

	Reproduces the call sequence used by `MedLayXPlain_official/MedLayEval/inference.py`
	without the batched leaderboard scaffolding.

	Run from this directory:

	python inference_example.py --image example.png

	If `--image` is omitted, a dummy 224x224 white square is used, so the
	script can verify the full load-and-forward pipeline end-to-end without
	an image on disk.
	"""
	import argparse
	from pathlib import Path

	import torch
	from PIL import Image
	from peft import PeftModel
	from transformers import AutoProcessor, Qwen2_5_VLForConditionalGeneration

	from model import VLMRegressor, ATTRS


	def parse_args(argv=None):
	    """Parse the command-line options of the one-sample inference example.

	    Args:
	        argv: Optional list of argument strings to parse. When ``None``
	            (the default) argparse falls back to ``sys.argv[1:]``, so
	            existing ``parse_args()`` callers behave exactly as before.
	            Passing an explicit list makes the function usable from tests
	            and other programs.

	    Returns:
	        An ``argparse.Namespace`` with the attributes ``ckpt_dir``,
	        ``base_model``, ``image``, ``expert``, ``lay``, ``device`` and
	        ``max_pixels``.
	    """
	    parser = argparse.ArgumentParser()
	    parser.add_argument(
	        "--ckpt_dir",
	        # Default to the directory that contains this script.
	        default=str(Path(__file__).parent),
	        help="path containing adapter_*, regression_head.pt, processor files",
	    )
	    parser.add_argument(
	        "--base_model",
	        default="Qwen/Qwen2.5-VL-3B-Instruct",
	        help="base model id or local path handed to from_pretrained",
	    )
	    parser.add_argument("--image", default=None, help="optional local image path")
	    parser.add_argument(
	        "--expert",
	        default=(
	            "Axial chest CT showing a 1.2 cm spiculated nodule in the right "
	            "upper lobe with adjacent pleural tethering, no mediastinal "
	            "lymphadenopathy, and no pleural effusion."
	        ),
	        help="expert-level description text",
	    )
	    parser.add_argument(
	        "--lay",
	        default=(
	            "The scan shows a small spot in the upper part of the right lung "
	            "with some pulling on the nearby lining. The lymph nodes in the "
	            "middle of the chest look normal and there is no fluid build-up."
	        ),
	        help="lay-language description text",
	    )
	    parser.add_argument(
	        "--device",
	        # Prefer the first CUDA device when one is visible, otherwise CPU.
	        default="cuda:0" if torch.cuda.is_available() else "cpu",
	        help="torch device string, e.g. cuda:0 or cpu",
	    )
	    parser.add_argument(
	        "--max_pixels",
	        type=int,
	        default=448 * 448,
	        help="image pixel budget as an integer number of pixels",
	    )
	    return parser.parse_args(argv)


	def load_model(ckpt_dir: str, base_model: str, device: str,
	               max_pixels: int = 448 * 448):
	    """Load the processor, the adapter-augmented VLM and the regression head.

	    Args:
	        ckpt_dir: Directory holding the PEFT adapter files and
	            ``regression_head.pt``.
	        base_model: Model id or path passed to ``from_pretrained`` for both
	            the processor and the base vision-language model.
	        device: Torch device string the model is moved to.
	        max_pixels: Value forwarded to ``AutoProcessor.from_pretrained`` as
	            ``max_pixels``. Defaults to ``448 * 448``, the value that was
	            previously hard-coded, so existing callers are unaffected.

	    Returns:
	        A ``(model, processor)`` tuple; ``model`` is a ``VLMRegressor`` in
	        eval mode.
	    """
	    import warnings

	    processor = AutoProcessor.from_pretrained(base_model, max_pixels=max_pixels)
	    tokenizer = processor.tokenizer
	    if tokenizer.pad_token is None:
	        # Padding is requested at inference time, so a pad token must exist.
	        tokenizer.pad_token = tokenizer.eos_token

	    base = Qwen2_5_VLForConditionalGeneration.from_pretrained(
	        base_model, torch_dtype=torch.bfloat16, attn_implementation="sdpa",
	    )
	    vlm = PeftModel.from_pretrained(base, ckpt_dir)
	    try:
	        vlm = vlm.merge_and_unload()
	    except Exception as err:
	        # Merging is best-effort: keep the un-merged PEFT wrapper, but say so
	        # instead of hiding the failure.
	        warnings.warn(
	            f"could not merge adapter weights into the base model "
	            f"({err!r}); continuing with the un-merged PEFT model",
	            RuntimeWarning,
	            stacklevel=2,
	        )

	    # Some configs expose hidden_size directly, others only on text_config.
	    config = vlm.config
	    if hasattr(config, "hidden_size"):
	        hidden = config.hidden_size
	    else:
	        hidden = config.text_config.hidden_size

	    model = VLMRegressor(vlm, hidden).to(device, dtype=torch.bfloat16)

	    head_path = Path(ckpt_dir) / "regression_head.pt"
	    # weights_only=True restricts unpickling to tensors and plain containers,
	    # which is all a state dict needs and avoids executing arbitrary pickled
	    # code from the checkpoint file.
	    head_state = torch.load(head_path, map_location=device, weights_only=True)
	    model.head.load_state_dict(head_state)
	    # The head is cast back to float32 after the whole model was moved to
	    # bfloat16 (presumably for numerical precision of the regression output).
	    model.head = model.head.to(device, dtype=torch.float32)
	    model.eval()
	    return model, processor


	def score_triple(model, processor, image: Image.Image, expert: str, lay: str, device: str):
	    """Score a single (image, expert text, lay text) triple.

	    Builds a one-turn user message containing the image and both texts,
	    renders it with the processor's chat template, runs the model without
	    gradient tracking and reads the first row of its output.

	    Args:
	        model: Callable model invoked as ``model(**inputs)``.
	        processor: Processor providing ``apply_chat_template`` and the
	            tokenisation / image preprocessing call.
	        image: PIL image paired with the two texts.
	        expert: Expert text; only its first 1500 characters are used.
	        lay: Lay text; only its first 1500 characters are used.
	        device: Torch device string the processed inputs are moved to.

	    Returns:
	        A dict mapping every name in ``ATTRS`` to a float score, plus an
	        ``"overall"`` entry holding their mean rounded to 4 decimals.
	    """
	    expert_part = expert[:1500]
	    lay_part = lay[:1500]
	    user_text = f"<expert>{expert_part}</expert>\n<lay>{lay_part}</lay>"

	    conversation = [
	        {
	            "role": "user",
	            "content": [
	                {"type": "image"},
	                {"type": "text", "text": user_text},
	            ],
	        },
	    ]
	    rendered = processor.apply_chat_template(
	        conversation,
	        tokenize=False,
	        add_generation_prompt=False,
	    )

	    batch = processor(
	        text=[rendered],
	        images=[image],
	        padding=True,
	        truncation=True,
	        max_length=2048,
	        return_tensors="pt",
	    )
	    batch = batch.to(device)

	    with torch.no_grad():
	        output = model(**batch)
	        # Only one sample is in the batch, so take row 0.
	        row = output.cpu().float().numpy()[0]

	    scores = {}
	    for idx, attr in enumerate(ATTRS):
	        scores[attr] = float(row[idx])
	    # The mean is computed before "overall" is inserted, so it covers the
	    # per-attribute scores only.
	    scores["overall"] = round(sum(scores.values()) / len(ATTRS), 4)
	    return scores


	def main():
	    """Run the example: parse options, load the model, score one triple, print.

	    When no image path is given, a 224x224 white RGB square stands in for
	    the image and a notice is printed.
	    """
	    args = parse_args()
	    # NOTE(review): args.max_pixels is parsed but is not forwarded to
	    # load_model by this function.
	    model, processor = load_model(args.ckpt_dir, args.base_model, args.device)

	    if not args.image:
	        image = Image.new("RGB", (224, 224), color=(255, 255, 255))
	        print("[no --image; using a 224x224 white square]")
	    else:
	        image = Image.open(args.image).convert("RGB")

	    results = score_triple(
	        model, processor, image, args.expert, args.lay, args.device,
	    )
	    for name, value in results.items():
	        print(f" {name:>12s}: {value:.4f}")


	# Script entry point: run the example only when this file is executed
	# directly, not when it is imported as a module.
	if __name__ == "__main__":
	main()