eval-pack / eval /eval_qwen_vl.py

Upload folder using huggingface_hub

2e7f2ce verified about 1 month ago

11.8 kB

	"""
	Qwen2.5-VL-3B Evaluation Script
	Evaluates the original Qwen2.5-VL-3B-Instruct (with vision) on held-out caption data.
	Also supports evaluating LoRA / block-circulant finetuned versions if checkpoints exist.

	Usage:
	    # Original model
	    python eval/eval_qwen_vl.py --mode all \
	        --model-path Finetune-Qwen2.5-VL/Qwen2.5-VL-3B-Instruct \
	        --eval-data data_dir/VoRA-Recap-29M/eval_qwenvl.jsonl

	    # With LoRA adapter
	    python eval/eval_qwen_vl.py --mode all \
	        --model-path Finetune-Qwen2.5-VL/Qwen2.5-VL-3B-Instruct \
	        --adapter-path Finetune-Qwen2.5-VL/saves/Qwen2.5-VL-3B-Instruct/lora \
	        --eval-data data_dir/VoRA-Recap-29M/eval_qwenvl.jsonl
	"""

	import argparse
	import json
	import math
	import os
	import sys

	import torch
	from PIL import Image
	from tqdm import tqdm
	from transformers import (
	AutoModelForCausalLM,
	AutoProcessor,
	AutoTokenizer,
	Qwen2VLForConditionalGeneration,
	)

	# Sentinel written into label positions that must not count as caption
	# tokens (the prompt part of the sequence). -100 is the default
	# ``ignore_index`` of torch.nn.CrossEntropyLoss.
	IGNORE_INDEX = -100


	# ============================================================
	# Data loading
	# ============================================================

	def load_eval_data(eval_path, max_samples=None):
	    """Read evaluation records from a JSON Lines file.

	    Every non-blank line of ``eval_path`` is parsed with ``json.loads`` and
	    appended to the result in file order.

	    Args:
	        eval_path: Path to the ``.jsonl`` file.
	        max_samples: Stop reading once this many records were collected.
	            ``None`` (or ``0``) means "no limit".

	    Returns:
	        A list with one parsed JSON value per non-blank line.

	    Raises:
	        OSError: If the file cannot be opened.
	        json.JSONDecodeError: If a non-blank line is not valid JSON.
	    """
	    data = []
	    # JSON text is UTF-8; do not depend on the platform's locale encoding.
	    with open(eval_path, "r", encoding="utf-8") as f:
	        for line in f:
	            line = line.strip()
	            if not line:
	                # Blank / whitespace-only lines are not records; feeding
	                # them to json.loads would raise.
	                continue
	            data.append(json.loads(line))
	            if max_samples and len(data) >= max_samples:
	                break
	    print(f"Loaded {len(data)} evaluation samples")
	    return data


	# ============================================================
	# Build inputs for Qwen2.5-VL
	# ============================================================

	def build_messages(image_path, caption=None, prompt="Describe this image.",
	                   system_prompt="You are a helpful assistant."):
	    """Build the chat-message list for one image-captioning sample.

	    Args:
	        image_path: Path to the image; it is made absolute and embedded as a
	            ``file://`` URI in the user turn.
	        caption: When not ``None``, appended as the assistant turn (used for
	            perplexity scoring). An empty string still adds the turn.
	        prompt: Text instruction placed after the image in the user turn.
	        system_prompt: Text of the system turn.

	    Returns:
	        A list of message dicts: system, user, and optionally assistant.
	    """
	    messages = [
	        {
	            "role": "system",
	            "content": [{"type": "text", "text": system_prompt}],
	        },
	        {
	            "role": "user",
	            "content": [
	                {"type": "image", "image": f"file://{os.path.abspath(image_path)}"},
	                {"type": "text", "text": prompt},
	            ],
	        },
	    ]
	    if caption is not None:
	        # For perplexity: the reference caption is the assistant response.
	        messages.append({
	            "role": "assistant",
	            "content": [{"type": "text", "text": caption}],
	        })
	    return messages


	def prepare_perplexity_inputs(processor, image_path, caption, device):
	    """Prepare labelled model inputs for scoring a reference caption.

	    The sample is encoded twice: once with the caption as the assistant
	    turn, and once as prompt-only with the generation prompt appended. The
	    prompt-only length is used to mask the prompt positions in the labels.

	    Args:
	        processor: Object providing ``apply_chat_template`` and a ``__call__``
	            that accepts ``text``/``images`` and returns a mapping with
	            ``input_ids`` that supports ``.to(device)``.
	        image_path: Path to the image file.
	        caption: Reference caption to score.
	        device: Device the encoded inputs are moved to.

	    Returns:
	        ``(inputs, n_caption_tokens)``: the full encoded inputs with a
	        ``labels`` entry added, and the number of label positions that are
	        not ``IGNORE_INDEX``.
	    """
	    # Full conversation with the ground-truth caption as assistant response.
	    text_full = processor.apply_chat_template(
	        build_messages(image_path, caption=caption),
	        tokenize=False, add_generation_prompt=False)

	    # Prompt only (no assistant response) to find where the caption starts.
	    text_prompt = processor.apply_chat_template(
	        build_messages(image_path, caption=None),
	        tokenize=False, add_generation_prompt=True)

	    # Close the file handle deterministically; convert() returns an
	    # independent in-memory image that outlives the ``with`` block.
	    with Image.open(image_path) as img:
	        image = img.convert("RGB")

	    # Both texts are processed together with the image so the prompt length
	    # is measured on the processed sequence (presumably image placeholders
	    # are expanded by the processor -- TODO confirm).
	    inputs_full = processor(
	        text=[text_full], images=[image], padding=True, return_tensors="pt"
	    ).to(device)
	    inputs_prompt = processor(
	        text=[text_prompt], images=[image], padding=True, return_tensors="pt"
	    ).to(device)

	    # Labels: copy of the input ids with every prompt position masked out.
	    prompt_len = inputs_prompt["input_ids"].shape[1]
	    labels = inputs_full["input_ids"].clone()
	    labels[:, :prompt_len] = IGNORE_INDEX
	    inputs_full["labels"] = labels

	    n_caption_tokens = int((labels != IGNORE_INDEX).sum().item())
	    return inputs_full, n_caption_tokens


	def prepare_generation_inputs(processor, image_path, device):
	    """Prepare prompt-only model inputs for caption generation.

	    Args:
	        processor: Object providing ``apply_chat_template`` and a ``__call__``
	            that accepts ``text``/``images`` and returns something supporting
	            ``.to(device)``.
	        image_path: Path to the image file.
	        device: Device the encoded inputs are moved to.

	    Returns:
	        The processor output for the prompt (with generation prompt
	        appended) and the image, moved to ``device``.
	    """
	    text = processor.apply_chat_template(
	        build_messages(image_path, caption=None),
	        tokenize=False, add_generation_prompt=True)

	    # Close the file handle deterministically; convert() returns an
	    # independent in-memory image that outlives the ``with`` block.
	    with Image.open(image_path) as img:
	        image = img.convert("RGB")

	    inputs = processor(
	        text=[text], images=[image], padding=True, return_tensors="pt"
	    ).to(device)
	    return inputs


	# ============================================================
	# Evaluation: Perplexity
	# ============================================================

	@torch.no_grad()
	def evaluate_perplexity(model, processor, eval_data, device):
	    """Compute token-weighted perplexity of reference captions.

	    For every record the model loss is multiplied by the number of caption
	    tokens and accumulated; the final perplexity is ``exp`` of the summed
	    loss divided by the summed token count. (Assumes ``outputs.loss`` is a
	    mean over the unmasked label positions -- TODO confirm for the model
	    class in use.)

	    Records whose image file is missing, whose processing raises, or whose
	    loss is unusable (NaN/inf, or zero caption tokens) are counted as errors
	    and skipped; only the first five exceptions are printed.

	    Args:
	        model: Callable model returning an object with a ``loss`` attribute.
	        processor: Processor passed to ``prepare_perplexity_inputs``.
	        eval_data: Sequence of dicts with ``"image"`` and ``"text"`` keys.
	        device: Device the inputs are moved to.

	    Returns:
	        The perplexity as a float, or ``float("inf")`` when no sample
	        contributed any tokens.
	    """
	    model.eval()
	    total_loss = 0.0
	    total_tokens = 0
	    errors = 0

	    for i, item in enumerate(tqdm(eval_data, desc="Qwen-VL Perplexity")):
	        image_path = item["image"]
	        caption = item["text"]

	        if not os.path.exists(image_path):
	            errors += 1
	            continue

	        try:
	            inputs, n_tokens = prepare_perplexity_inputs(
	                processor, image_path, caption, device)
	            loss_value = model(**inputs).loss.item()
	            # One NaN/inf loss (e.g. a half-precision overflow) would turn
	            # the whole running sum into NaN, so treat it as a failed sample.
	            if n_tokens == 0 or not math.isfinite(loss_value):
	                raise ValueError(
	                    f"unusable loss {loss_value} over {n_tokens} caption tokens")
	            total_loss += loss_value * n_tokens
	            total_tokens += n_tokens
	        except Exception as e:
	            # Best-effort evaluation: a bad sample must not abort the run.
	            errors += 1
	            if errors <= 5:
	                print(f" Error on sample {i}: {e}")
	            continue

	    if total_tokens == 0:
	        print("No valid samples!")
	        return float("inf")

	    avg_loss = total_loss / total_tokens
	    perplexity = math.exp(avg_loss)
	    print(f"\n=== Qwen2.5-VL Perplexity Results ===")
	    print(f"Samples: {len(eval_data) - errors}/{len(eval_data)}")
	    print(f"Errors: {errors}")
	    print(f"Average CE loss: {avg_loss:.4f}")
	    print(f"Perplexity: {perplexity:.2f}")
	    return perplexity


	# ============================================================
	# Evaluation: Caption Generation
	# ============================================================

	@torch.no_grad()
	def evaluate_caption(model, processor, eval_data, device, max_new_tokens=256):
	    """Generate a caption per record and score it against the reference.

	    Records whose image file does not exist, or whose processing/generation
	    raises, are counted as errors and skipped (only the first five
	    exceptions are printed). Decoding is greedy (``do_sample=False``).

	    Args:
	        model: Model exposing ``eval()`` and ``generate(...)``.
	        processor: Processor with a ``tokenizer.decode`` method.
	        eval_data: Sequence of dicts with ``"image"`` and ``"text"`` keys.
	        device: Device the inputs are moved to.
	        max_new_tokens: Generation length limit per sample.

	    Returns:
	        The metrics dict from ``_compute_metrics``, or ``{}`` when no sample
	        produced a prediction.
	    """
	    model.eval()
	    predictions, references = [], []
	    errors = 0

	    progress = tqdm(eval_data, desc="Qwen-VL Caption")
	    for idx, record in enumerate(progress):
	        image_path, reference = record["image"], record["text"]

	        if not os.path.exists(image_path):
	            errors += 1
	            continue

	        try:
	            inputs = prepare_generation_inputs(processor, image_path, device)
	            n_prompt = inputs["input_ids"].shape[1]
	            sequences = model.generate(
	                **inputs,
	                max_new_tokens=max_new_tokens,
	                do_sample=False,
	            )
	            # Keep only what was generated after the prompt tokens.
	            new_tokens = sequences[0][n_prompt:]
	            decoded = processor.tokenizer.decode(new_tokens, skip_special_tokens=True)
	            predictions.append(decoded)
	            references.append(reference)
	        except Exception as exc:
	            errors += 1
	            if errors <= 5:
	                print(f" Error on sample {idx}: {exc}")
	            continue

	    if len(predictions) == 0:
	        print("No valid samples!")
	        return {}

	    metrics = _compute_metrics(predictions, references)
	    print(f"\n=== Qwen2.5-VL Caption Results ===")
	    print(f"Samples: {len(predictions)}/{len(eval_data)}")
	    print(f"Errors: {errors}")
	    for name, value in metrics.items():
	        print(f"{name}: {value:.4f}")

	    print(f"\n--- Sample Outputs (first 5) ---")
	    # predictions and references are appended in lockstep, so zipping the
	    # first five of each walks the same pairs as indexing 0..4.
	    for n, (pred, ref) in enumerate(zip(predictions[:5], references[:5])):
	        print(f"[{n}] Generated: {pred[:200]}")
	        print(f"[{n}] Reference: {ref[:200]}")
	        print()

	    return metrics


	def _compute_metrics(predictions, references):
	metrics = {}
	try:
	from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction
	smooth = SmoothingFunction().method1
	refs = [[ref.split()] for ref in references]
	preds = [pred.split() for pred in predictions]
	metrics["BLEU-1"] = corpus_bleu(refs, preds, weights=(1, 0, 0, 0), smoothing_function=smooth)
	metrics["BLEU-4"] = corpus_bleu(refs, preds, weights=(0.25, 0.25, 0.25, 0.25), smoothing_function=smooth)
	except ImportError:
	print("Warning: nltk not installed. pip install nltk")
	try:
	from rouge_score import rouge_scorer
	scorer = rouge_scorer.RougeScorer(["rougeL"], use_stemmer=True)
	scores = [scorer.score(ref, pred)["rougeL"].fmeasure for pred, ref in zip(predictions, references)]
	metrics["ROUGE-L"] = sum(scores) / len(scores)
	except ImportError:
	print("Warning: rouge-score not installed. pip install rouge-score")
	return metrics


	# ============================================================
	# Model loading
	# ============================================================

	def load_model(model_path, adapter_path=None, dtype=torch.float16):
	    """Load the processor and model, optionally merging a PEFT adapter.

	    Model classes are tried in order: ``Qwen2_5_VLForConditionalGeneration``
	    (when the installed transformers provides it), then
	    ``Qwen2VLForConditionalGeneration``, then ``AutoModelForCausalLM``. The
	    reason each failed attempt was rejected is printed; the exception of the
	    last attempt propagates.

	    Args:
	        model_path: Path or identifier of the base checkpoint.
	        adapter_path: Optional adapter directory. If it exists it is loaded
	            with ``peft.PeftModel`` and merged into the base weights; if it
	            is given but missing, a warning is printed and the base model is
	            returned unchanged.
	        dtype: Torch dtype passed as ``torch_dtype`` when loading.

	    Returns:
	        ``(model, processor)`` with the model in eval mode.
	    """
	    print(f"Loading Qwen2.5-VL from {model_path} ...")
	    # NOTE(review): trust_remote_code=True allows the checkpoint to ship and
	    # run its own Python code -- only point this at trusted checkpoints.
	    processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)

	    # Prefer the dedicated Qwen2.5-VL class; the Qwen2-VL class is a
	    # different architecture and is kept only as a fallback.
	    candidates = []
	    try:
	        from transformers import Qwen2_5_VLForConditionalGeneration
	        candidates.append(Qwen2_5_VLForConditionalGeneration)
	    except ImportError:
	        # Older transformers release without the Qwen2.5-VL class.
	        print("Qwen2_5_VLForConditionalGeneration is not available in this "
	              "transformers version; falling back to older loaders.")
	    candidates.extend([Qwen2VLForConditionalGeneration, AutoModelForCausalLM])

	    model = None
	    for position, model_cls in enumerate(candidates):
	        try:
	            model = model_cls.from_pretrained(
	                model_path,
	                torch_dtype=dtype,
	                device_map="auto",
	                trust_remote_code=True,
	            )
	            break
	        except Exception as e:
	            if position == len(candidates) - 1:
	                # No loader left: surface the real failure to the caller.
	                raise
	            print(f"{model_cls.__name__} could not load the checkpoint "
	                  f"({e}); trying the next loader ...")

	    # Load LoRA adapter if provided
	    if adapter_path:
	        if os.path.exists(adapter_path):
	            print(f"Loading adapter from {adapter_path} ...")
	            from peft import PeftModel
	            model = PeftModel.from_pretrained(model, adapter_path)
	            model = model.merge_and_unload()
	            print("Adapter merged.")
	        else:
	            # Do not silently evaluate the base model under an adapter's name.
	            print(f"Warning: adapter path {adapter_path} does not exist; "
	                  "evaluating the base model without an adapter.")

	    model.eval()
	    device = next(model.parameters()).device
	    print(f"Model loaded on {device}")
	    return model, processor


	# ============================================================
	# Main
	# ============================================================

	def main():
	    """Command-line entry point: load the model, run the evaluations, save results.

	    Parses the CLI arguments, loads model and evaluation data, runs the
	    perplexity and/or caption evaluation selected by ``--mode``, and, when
	    ``--output`` is given, writes the collected results as JSON.
	    """
	    parser = argparse.ArgumentParser(description="Qwen2.5-VL-3B Evaluation")
	    parser.add_argument("--mode", type=str, default="all",
	                        choices=["perplexity", "caption", "all"])
	    parser.add_argument("--model-path", type=str, required=True,
	                        help="Path to Qwen2.5-VL-3B-Instruct")
	    parser.add_argument("--adapter-path", type=str, default=None,
	                        help="Path to LoRA/circulant adapter (optional)")
	    parser.add_argument("--eval-data", type=str, required=True,
	                        help="Path to eval_qwenvl.jsonl")
	    parser.add_argument("--max-samples", type=int, default=None)
	    parser.add_argument("--max-new-tokens", type=int, default=256)
	    parser.add_argument("--dtype", type=str, default="float16",
	                        choices=["float16", "bfloat16"])
	    parser.add_argument("--output", type=str, default=None)
	    args = parser.parse_args()

	    dtype = torch.float16 if args.dtype == "float16" else torch.bfloat16
	    model, processor = load_model(args.model_path, args.adapter_path, dtype)
	    device = next(model.parameters()).device

	    eval_data = load_eval_data(args.eval_data, max_samples=args.max_samples)

	    model_name = "Qwen2.5-VL-3B"
	    # load_model only applies an adapter whose path exists, so only then may
	    # the results be labelled with it. normpath drops a trailing slash,
	    # which would otherwise make basename() return an empty string.
	    if args.adapter_path and os.path.exists(args.adapter_path):
	        adapter_name = os.path.basename(os.path.normpath(args.adapter_path))
	        model_name += f" + {adapter_name}"
	    results = {"model": model_name, "num_samples": len(eval_data)}

	    if args.mode in ("perplexity", "all"):
	        ppl = evaluate_perplexity(model, processor, eval_data, device)
	        results["perplexity"] = ppl

	    if args.mode in ("caption", "all"):
	        metrics = evaluate_caption(
	            model, processor, eval_data, device, max_new_tokens=args.max_new_tokens)
	        results.update(metrics)

	    if args.output:
	        os.makedirs(os.path.dirname(args.output) or ".", exist_ok=True)
	        # ensure_ascii=False emits raw non-ASCII characters, so the file
	        # encoding must be fixed rather than left to the locale.
	        with open(args.output, "w", encoding="utf-8") as f:
	            json.dump(results, f, indent=2, ensure_ascii=False)
	        print(f"\nResults saved to {args.output}")


	# Run the CLI only when executed as a script, not when imported.
	if __name__ == "__main__":
	    main()