""" Qwen2.5-VL-3B Evaluation Script Evaluates the original Qwen2.5-VL-3B-Instruct (with vision) on held-out caption data. Also supports evaluating LoRA / block-circulant finetuned versions if checkpoints exist. Usage: # Original model python eval/eval_qwen_vl.py --mode all \ --model-path Finetune-Qwen2.5-VL/Qwen2.5-VL-3B-Instruct \ --eval-data data_dir/VoRA-Recap-29M/eval_qwenvl.jsonl # With LoRA adapter python eval/eval_qwen_vl.py --mode all \ --model-path Finetune-Qwen2.5-VL/Qwen2.5-VL-3B-Instruct \ --adapter-path Finetune-Qwen2.5-VL/saves/Qwen2.5-VL-3B-Instruct/lora \ --eval-data data_dir/VoRA-Recap-29M/eval_qwenvl.jsonl """ import argparse import json import math import os import sys import torch from PIL import Image from tqdm import tqdm from transformers import ( AutoModelForCausalLM, AutoProcessor, AutoTokenizer, Qwen2VLForConditionalGeneration, ) IGNORE_INDEX = -100 # ============================================================ # Data loading # ============================================================ def load_eval_data(eval_path, max_samples=None): data = [] with open(eval_path, "r") as f: for line in f: item = json.loads(line.strip()) data.append(item) if max_samples and len(data) >= max_samples: break print(f"Loaded {len(data)} evaluation samples") return data # ============================================================ # Build inputs for Qwen2.5-VL # ============================================================ def build_messages(image_path, caption=None): """Build Qwen2.5-VL chat messages for image captioning.""" messages = [ { "role": "system", "content": [{"type": "text", "text": "You are a helpful assistant."}], }, { "role": "user", "content": [ {"type": "image", "image": f"file://{os.path.abspath(image_path)}"}, {"type": "text", "text": "Describe this image."}, ], }, ] if caption is not None: # For perplexity: add assistant response messages.append({ "role": "assistant", "content": [{"type": "text", "text": caption}], }) return messages def prepare_perplexity_inputs(processor, image_path, caption, device): """Prepare inputs for perplexity evaluation (with labels).""" # Full messages with the ground truth caption as assistant response messages_full = build_messages(image_path, caption=caption) text_full = processor.apply_chat_template( messages_full, tokenize=False, add_generation_prompt=False) # Prompt-only (no assistant response) to find where caption starts messages_prompt = build_messages(image_path, caption=None) text_prompt = processor.apply_chat_template( messages_prompt, tokenize=False, add_generation_prompt=True) # Process full input with image image = Image.open(image_path).convert("RGB") inputs_full = processor( text=[text_full], images=[image], padding=True, return_tensors="pt" ).to(device) inputs_prompt = processor( text=[text_prompt], images=[image], padding=True, return_tensors="pt" ).to(device) # Create labels: mask out prompt tokens input_ids = inputs_full["input_ids"] prompt_len = inputs_prompt["input_ids"].shape[1] labels = input_ids.clone() labels[:, :prompt_len] = IGNORE_INDEX n_caption_tokens = int((labels != IGNORE_INDEX).sum().item()) inputs_full["labels"] = labels return inputs_full, n_caption_tokens def prepare_generation_inputs(processor, image_path, device): """Prepare inputs for caption generation.""" messages = build_messages(image_path, caption=None) text = processor.apply_chat_template( messages, tokenize=False, add_generation_prompt=True) image = Image.open(image_path).convert("RGB") inputs = processor( text=[text], images=[image], padding=True, return_tensors="pt" ).to(device) return inputs # ============================================================ # Evaluation: Perplexity # ============================================================ @torch.no_grad() def evaluate_perplexity(model, processor, eval_data, device): model.eval() total_loss = 0.0 total_tokens = 0 errors = 0 for i, item in enumerate(tqdm(eval_data, desc="Qwen-VL Perplexity")): image_path = item["image"] caption = item["text"] if not os.path.exists(image_path): errors += 1 continue try: inputs, n_tokens = prepare_perplexity_inputs( processor, image_path, caption, device) outputs = model(**inputs) loss = outputs.loss total_loss += loss.item() * n_tokens total_tokens += n_tokens except Exception as e: errors += 1 if errors <= 5: print(f" Error on sample {i}: {e}") continue if total_tokens == 0: print("No valid samples!") return float("inf") avg_loss = total_loss / total_tokens perplexity = math.exp(avg_loss) print(f"\n=== Qwen2.5-VL Perplexity Results ===") print(f"Samples: {len(eval_data) - errors}/{len(eval_data)}") print(f"Errors: {errors}") print(f"Average CE loss: {avg_loss:.4f}") print(f"Perplexity: {perplexity:.2f}") return perplexity # ============================================================ # Evaluation: Caption Generation # ============================================================ @torch.no_grad() def evaluate_caption(model, processor, eval_data, device, max_new_tokens=256): model.eval() predictions = [] references = [] errors = 0 for i, item in enumerate(tqdm(eval_data, desc="Qwen-VL Caption")): image_path = item["image"] caption = item["text"] if not os.path.exists(image_path): errors += 1 continue try: inputs = prepare_generation_inputs(processor, image_path, device) prompt_len = inputs["input_ids"].shape[1] outputs = model.generate( **inputs, max_new_tokens=max_new_tokens, do_sample=False, ) generated = outputs[0][prompt_len:] text = processor.tokenizer.decode(generated, skip_special_tokens=True) predictions.append(text) references.append(caption) except Exception as e: errors += 1 if errors <= 5: print(f" Error on sample {i}: {e}") continue if not predictions: print("No valid samples!") return {} metrics = _compute_metrics(predictions, references) print(f"\n=== Qwen2.5-VL Caption Results ===") print(f"Samples: {len(predictions)}/{len(eval_data)}") print(f"Errors: {errors}") for k, v in metrics.items(): print(f"{k}: {v:.4f}") print(f"\n--- Sample Outputs (first 5) ---") for i in range(min(5, len(predictions))): print(f"[{i}] Generated: {predictions[i][:200]}") print(f"[{i}] Reference: {references[i][:200]}") print() return metrics def _compute_metrics(predictions, references): metrics = {} try: from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction smooth = SmoothingFunction().method1 refs = [[ref.split()] for ref in references] preds = [pred.split() for pred in predictions] metrics["BLEU-1"] = corpus_bleu(refs, preds, weights=(1, 0, 0, 0), smoothing_function=smooth) metrics["BLEU-4"] = corpus_bleu(refs, preds, weights=(0.25, 0.25, 0.25, 0.25), smoothing_function=smooth) except ImportError: print("Warning: nltk not installed. pip install nltk") try: from rouge_score import rouge_scorer scorer = rouge_scorer.RougeScorer(["rougeL"], use_stemmer=True) scores = [scorer.score(ref, pred)["rougeL"].fmeasure for pred, ref in zip(predictions, references)] metrics["ROUGE-L"] = sum(scores) / len(scores) except ImportError: print("Warning: rouge-score not installed. pip install rouge-score") return metrics # ============================================================ # Model loading # ============================================================ def load_model(model_path, adapter_path=None, dtype=torch.float16): print(f"Loading Qwen2.5-VL from {model_path} ...") processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True) # Try Qwen2VL-specific class first, fall back to AutoModel try: model = Qwen2VLForConditionalGeneration.from_pretrained( model_path, torch_dtype=dtype, device_map="auto", trust_remote_code=True, ) except Exception: model = AutoModelForCausalLM.from_pretrained( model_path, torch_dtype=dtype, device_map="auto", trust_remote_code=True, ) # Load LoRA adapter if provided if adapter_path and os.path.exists(adapter_path): print(f"Loading adapter from {adapter_path} ...") from peft import PeftModel model = PeftModel.from_pretrained(model, adapter_path) model = model.merge_and_unload() print("Adapter merged.") model.eval() device = next(model.parameters()).device print(f"Model loaded on {device}") return model, processor # ============================================================ # Main # ============================================================ def main(): parser = argparse.ArgumentParser(description="Qwen2.5-VL-3B Evaluation") parser.add_argument("--mode", type=str, default="all", choices=["perplexity", "caption", "all"]) parser.add_argument("--model-path", type=str, required=True, help="Path to Qwen2.5-VL-3B-Instruct") parser.add_argument("--adapter-path", type=str, default=None, help="Path to LoRA/circulant adapter (optional)") parser.add_argument("--eval-data", type=str, required=True, help="Path to eval_qwenvl.jsonl") parser.add_argument("--max-samples", type=int, default=None) parser.add_argument("--max-new-tokens", type=int, default=256) parser.add_argument("--dtype", type=str, default="float16", choices=["float16", "bfloat16"]) parser.add_argument("--output", type=str, default=None) args = parser.parse_args() dtype = torch.float16 if args.dtype == "float16" else torch.bfloat16 model, processor = load_model(args.model_path, args.adapter_path, dtype) device = next(model.parameters()).device eval_data = load_eval_data(args.eval_data, max_samples=args.max_samples) model_name = "Qwen2.5-VL-3B" if args.adapter_path: model_name += f" + {os.path.basename(args.adapter_path)}" results = {"model": model_name, "num_samples": len(eval_data)} if args.mode in ("perplexity", "all"): ppl = evaluate_perplexity(model, processor, eval_data, device) results["perplexity"] = ppl if args.mode in ("caption", "all"): metrics = evaluate_caption( model, processor, eval_data, device, max_new_tokens=args.max_new_tokens) results.update(metrics) if args.output: os.makedirs(os.path.dirname(args.output) or ".", exist_ok=True) with open(args.output, "w") as f: json.dump(results, f, indent=2, ensure_ascii=False) print(f"\nResults saved to {args.output}") if __name__ == "__main__": main()