"""
Qwen2.5-VL-3B Evaluation Script
Evaluates the original Qwen2.5-VL-3B-Instruct (with vision) on held-out caption data.
Also supports evaluating LoRA / block-circulant finetuned versions if checkpoints exist.

Usage:
    # Original model
    python eval/eval_qwen_vl.py --mode all \
        --model-path Finetune-Qwen2.5-VL/Qwen2.5-VL-3B-Instruct \
        --eval-data data_dir/VoRA-Recap-29M/eval_qwenvl.jsonl

    # With LoRA adapter
    python eval/eval_qwen_vl.py --mode all \
        --model-path Finetune-Qwen2.5-VL/Qwen2.5-VL-3B-Instruct \
        --adapter-path Finetune-Qwen2.5-VL/saves/Qwen2.5-VL-3B-Instruct/lora \
        --eval-data data_dir/VoRA-Recap-29M/eval_qwenvl.jsonl
"""
|
|
| import argparse |
| import json |
| import math |
| import os |
| import sys |
|
|
| import torch |
| from PIL import Image |
| from tqdm import tqdm |
| from transformers import ( |
| AutoModelForCausalLM, |
| AutoProcessor, |
| AutoTokenizer, |
| Qwen2VLForConditionalGeneration, |
| ) |
|
|
# Label value written over prompt positions so that only caption tokens are
# counted (and scored) during perplexity evaluation.
IGNORE_INDEX = -100
|
|
|
|
| |
| |
| |
|
|
def load_eval_data(eval_path, max_samples=None):
    """Load evaluation records from a JSON Lines file.

    Args:
        eval_path: Path to a file holding one JSON document per line.
        max_samples: Stop after this many records. ``None`` (or any other
            falsy value such as ``0``) loads the whole file.

    Returns:
        A list with the decoded records, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    data = []
    # Explicit UTF-8: captions may contain non-ASCII text and the platform
    # default encoding is not guaranteed to be UTF-8.
    with open(eval_path, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                # Blank lines (e.g. a trailing newline at EOF) are not
                # records; json.loads("") would raise on them.
                continue
            data.append(json.loads(line))
            if max_samples and len(data) >= max_samples:
                break
    print(f"Loaded {len(data)} evaluation samples")
    return data
|
|
|
|
| |
| |
| |
|
|
def build_messages(image_path, caption=None):
    """Build Qwen2.5-VL chat messages for image captioning.

    Args:
        image_path: Path to the image; it is made absolute and embedded as a
            ``file://`` URI in the user turn.
        caption: Optional reference caption. When it is not ``None`` (an
            empty string counts as a caption) an assistant turn holding it
            is appended after the user turn.

    Returns:
        A list of message dicts: system turn, user turn (image + fixed
        "Describe this image." instruction) and, optionally, the assistant
        turn.
    """
    system_turn = {
        "role": "system",
        "content": [{"type": "text", "text": "You are a helpful assistant."}],
    }
    user_turn = {
        "role": "user",
        "content": [
            {"type": "image", "image": f"file://{os.path.abspath(image_path)}"},
            {"type": "text", "text": "Describe this image."},
        ],
    }
    messages = [system_turn, user_turn]
    if caption is not None:
        # Teacher-forcing case: the reference caption becomes the assistant
        # reply so it can be scored.
        messages.append({
            "role": "assistant",
            "content": [{"type": "text", "text": caption}],
        })
    return messages
|
|
|
|
def prepare_perplexity_inputs(processor, image_path, caption, device):
    """Prepare inputs for perplexity evaluation (with labels).

    The conversation is rendered twice: once including the assistant turn
    with ``caption`` and once as the prompt only (with the generation prompt
    appended). The token length of the prompt-only rendering is used to mask
    the leading label positions with ``IGNORE_INDEX``.

    Args:
        processor: Processor exposing ``apply_chat_template`` and a callable
            interface that accepts ``text``/``images``.
        image_path: Path of the image to caption.
        caption: Reference caption to score.
        device: Device the full inputs are moved to.

    Returns:
        ``(inputs, n_caption_tokens)`` where ``inputs`` is the processed
        batch with a ``"labels"`` entry added and ``n_caption_tokens`` is
        the number of label positions not equal to ``IGNORE_INDEX``.

    NOTE(review): the masking assumes the prompt-only token sequence is an
    exact prefix of the full sequence — confirm for the tokenizer in use.
    The unmasked span also includes whatever the chat template emits after
    the caption text.
    """
    # Full conversation: system + user + assistant(caption).
    text_full = processor.apply_chat_template(
        build_messages(image_path, caption=caption),
        tokenize=False, add_generation_prompt=False)

    # Prompt only, ending where the assistant reply would start.
    text_prompt = processor.apply_chat_template(
        build_messages(image_path, caption=None),
        tokenize=False, add_generation_prompt=True)

    # Close the file handle deterministically; convert() returns an
    # independent in-memory image, so it stays usable after the `with`.
    with Image.open(image_path) as raw_image:
        image = raw_image.convert("RGB")

    inputs_full = processor(
        text=[text_full], images=[image], padding=True, return_tensors="pt"
    ).to(device)
    # Only the token count of the prompt is needed, so this batch is not
    # moved to the device.
    inputs_prompt = processor(
        text=[text_prompt], images=[image], padding=True, return_tensors="pt"
    )

    # Score only what follows the prompt.
    prompt_len = inputs_prompt["input_ids"].shape[1]
    labels = inputs_full["input_ids"].clone()
    labels[:, :prompt_len] = IGNORE_INDEX

    n_caption_tokens = int((labels != IGNORE_INDEX).sum().item())
    inputs_full["labels"] = labels

    return inputs_full, n_caption_tokens
|
|
|
|
def prepare_generation_inputs(processor, image_path, device):
    """Prepare inputs for caption generation.

    Args:
        processor: Processor exposing ``apply_chat_template`` and a callable
            interface that accepts ``text``/``images``.
        image_path: Path of the image to caption.
        device: Device the processed batch is moved to.

    Returns:
        The processed single-sample batch (prompt text with generation
        prompt appended, plus the image), moved to ``device``.
    """
    text = processor.apply_chat_template(
        build_messages(image_path, caption=None),
        tokenize=False, add_generation_prompt=True)

    # Close the file handle deterministically; convert() returns an
    # independent in-memory image, so it stays usable after the `with`.
    with Image.open(image_path) as raw_image:
        image = raw_image.convert("RGB")

    inputs = processor(
        text=[text], images=[image], padding=True, return_tensors="pt"
    ).to(device)
    return inputs
|
|
|
|
| |
| |
| |
|
|
@torch.no_grad()
def evaluate_perplexity(model, processor, eval_data, device):
    """Compute token-weighted perplexity of reference captions.

    Each record must provide ``"image"`` (a path) and ``"text"`` (the
    reference caption). Records whose image is missing, whose processing or
    forward pass raises, that yield no caption tokens, or whose loss is not
    finite are counted as errors and skipped; the first five error messages
    are printed.

    Args:
        model: Model called as ``model(**inputs)`` and exposing ``.loss`` on
            its output.
        processor: Processor forwarded to ``prepare_perplexity_inputs``.
        eval_data: Sequence of record dicts.
        device: Device the inputs are moved to.

    Returns:
        The perplexity ``exp(total_loss / total_tokens)``, or
        ``float("inf")`` when no sample could be scored (or the exponent
        overflows).

    NOTE(review): weighting by ``n_tokens`` assumes ``outputs.loss`` is the
    mean over the unmasked label tokens — confirm for the model class used.
    """
    model.eval()
    total_loss = 0.0
    total_tokens = 0
    errors = 0

    for i, item in enumerate(tqdm(eval_data, desc="Qwen-VL Perplexity")):
        image_path = item["image"]
        caption = item["text"]

        if not os.path.exists(image_path):
            errors += 1
            continue

        try:
            inputs, n_tokens = prepare_perplexity_inputs(
                processor, image_path, caption, device)
            if n_tokens == 0:
                # Nothing to score; a mean over zero tokens would be NaN.
                raise ValueError("no caption tokens to score")
            loss = model(**inputs).loss.item()
            if not math.isfinite(loss):
                # One NaN/inf sample would otherwise poison the whole sum.
                raise ValueError(f"non-finite loss ({loss})")
        except Exception as e:
            errors += 1
            if errors <= 5:
                print(f"  Error on sample {i}: {e}")
            continue

        total_loss += loss * n_tokens
        total_tokens += n_tokens

    if total_tokens == 0:
        print("No valid samples!")
        return float("inf")

    avg_loss = total_loss / total_tokens
    try:
        perplexity = math.exp(avg_loss)
    except OverflowError:
        # exp() of a very large average loss does not fit in a float.
        perplexity = float("inf")
    print(f"\n=== Qwen2.5-VL Perplexity Results ===")
    print(f"Samples: {len(eval_data) - errors}/{len(eval_data)}")
    print(f"Errors: {errors}")
    print(f"Average CE loss: {avg_loss:.4f}")
    print(f"Perplexity: {perplexity:.2f}")
    return perplexity
|
|
|
|
| |
| |
| |
|
|
@torch.no_grad()
def evaluate_caption(model, processor, eval_data, device, max_new_tokens=256):
    """Generate captions greedily and score them against the references.

    Each record must provide ``"image"`` (a path) and ``"text"`` (the
    reference caption). Records whose image is missing or whose processing /
    generation raises are counted as errors and skipped; the first five
    error messages are printed.

    Args:
        model: Model exposing ``generate``.
        processor: Processor forwarded to ``prepare_generation_inputs``; its
            ``tokenizer.decode`` turns generated ids back into text.
        eval_data: Sequence of record dicts.
        device: Device the inputs are moved to.
        max_new_tokens: Generation budget per sample.

    Returns:
        The metric dict produced by ``_compute_metrics``, or ``{}`` when no
        sample produced a prediction.
    """
    model.eval()
    predictions = []
    references = []
    errors = 0

    for i, item in enumerate(tqdm(eval_data, desc="Qwen-VL Caption")):
        image_path = item["image"]
        caption = item["text"]

        if not os.path.exists(image_path):
            errors += 1
            continue

        try:
            inputs = prepare_generation_inputs(processor, image_path, device)
            prompt_len = inputs["input_ids"].shape[1]

            outputs = model.generate(
                **inputs,
                max_new_tokens=max_new_tokens,
                do_sample=False,
            )

            # Drop the echoed prompt; keep only the newly generated ids.
            generated = outputs[0][prompt_len:]
            text = processor.tokenizer.decode(generated, skip_special_tokens=True)
            predictions.append(text)
            references.append(caption)
        except Exception as e:
            errors += 1
            if errors <= 5:
                print(f"  Error on sample {i}: {e}")
            continue

    if not predictions:
        print("No valid samples!")
        return {}

    metrics = _compute_metrics(predictions, references)
    print(f"\n=== Qwen2.5-VL Caption Results ===")
    print(f"Samples: {len(predictions)}/{len(eval_data)}")
    print(f"Errors: {errors}")
    for k, v in metrics.items():
        print(f"{k}: {v:.4f}")

    print(f"\n--- Sample Outputs (first 5) ---")
    for i, (pred, ref) in enumerate(zip(predictions[:5], references[:5])):
        print(f"[{i}] Generated: {pred[:200]}")
        print(f"[{i}] Reference: {ref[:200]}")
        print()

    return metrics
|
|
|
|
| def _compute_metrics(predictions, references): |
| metrics = {} |
| try: |
| from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction |
| smooth = SmoothingFunction().method1 |
| refs = [[ref.split()] for ref in references] |
| preds = [pred.split() for pred in predictions] |
| metrics["BLEU-1"] = corpus_bleu(refs, preds, weights=(1, 0, 0, 0), smoothing_function=smooth) |
| metrics["BLEU-4"] = corpus_bleu(refs, preds, weights=(0.25, 0.25, 0.25, 0.25), smoothing_function=smooth) |
| except ImportError: |
| print("Warning: nltk not installed. pip install nltk") |
| try: |
| from rouge_score import rouge_scorer |
| scorer = rouge_scorer.RougeScorer(["rougeL"], use_stemmer=True) |
| scores = [scorer.score(ref, pred)["rougeL"].fmeasure for pred, ref in zip(predictions, references)] |
| metrics["ROUGE-L"] = sum(scores) / len(scores) |
| except ImportError: |
| print("Warning: rouge-score not installed. pip install rouge-score") |
| return metrics |
|
|
|
|
| |
| |
| |
|
|
def load_model(model_path, adapter_path=None, dtype=torch.float16):
    """Load the processor and model, optionally merging a PEFT adapter.

    The model class is tried in order: ``Qwen2_5_VLForConditionalGeneration``
    (only when the checkpoint config reports ``model_type == "qwen2_5_vl"``
    and the installed transformers provides the class), then
    ``Qwen2VLForConditionalGeneration``, then ``AutoModelForCausalLM``.
    Each failed attempt is printed before the next one is tried.

    Args:
        model_path: Path or identifier passed to ``from_pretrained``.
        adapter_path: Optional adapter directory. When it exists the adapter
            is loaded with ``peft.PeftModel`` and merged into the base
            weights; when it is given but missing, a warning is printed and
            the base model is returned.
        dtype: Torch dtype passed as ``torch_dtype``.

    Returns:
        ``(model, processor)`` with the model in eval mode.

    Raises:
        Exception: Whatever the last attempted ``from_pretrained`` raised
            when every candidate class fails.
    """
    print(f"Loading Qwen2.5-VL from {model_path} ...")
    processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)

    model_classes = [Qwen2VLForConditionalGeneration, AutoModelForCausalLM]

    # Pick the architecture from the checkpoint itself: Qwen2.5-VL weights
    # need their own class rather than the Qwen2-VL one.
    try:
        from transformers import AutoConfig
        model_type = AutoConfig.from_pretrained(
            model_path, trust_remote_code=True).model_type
    except Exception:
        # Best effort only; fall back to the original candidate order.
        model_type = None
    if model_type == "qwen2_5_vl":
        try:
            from transformers import Qwen2_5_VLForConditionalGeneration
            model_classes.insert(0, Qwen2_5_VLForConditionalGeneration)
        except ImportError:
            print("Warning: installed transformers has no "
                  "Qwen2_5_VLForConditionalGeneration; falling back to "
                  "Qwen2VLForConditionalGeneration (upgrade transformers).")

    model = None
    last_error = None
    for model_cls in model_classes:
        try:
            model = model_cls.from_pretrained(
                model_path,
                torch_dtype=dtype,
                device_map="auto",
                trust_remote_code=True,
            )
            break
        except Exception as e:
            # Keep going, but do not hide why this class was rejected.
            last_error = e
            print(f"  {model_cls.__name__} failed to load: {e}")
    if model is None:
        raise last_error

    if adapter_path:
        if os.path.exists(adapter_path):
            print(f"Loading adapter from {adapter_path} ...")
            from peft import PeftModel
            model = PeftModel.from_pretrained(model, adapter_path)
            model = model.merge_and_unload()
            print("Adapter merged.")
        else:
            # Previously silent: results would be reported for the base
            # model without any hint that the adapter was not applied.
            print(f"Warning: adapter path {adapter_path} does not exist; "
                  "evaluating the base model without an adapter.")

    model.eval()
    device = next(model.parameters()).device
    print(f"Model loaded on {device}")
    return model, processor
|
|
|
|
| |
| |
| |
|
|
def main():
    """Command-line entry point: load the model, run the evaluations, save results.

    Parses the CLI flags, loads the model (and optional adapter), loads the
    evaluation records, runs perplexity and/or caption evaluation according
    to ``--mode``, and, when ``--output`` is given, writes the collected
    results as JSON.
    """
    parser = argparse.ArgumentParser(description="Qwen2.5-VL-3B Evaluation")
    parser.add_argument("--mode", type=str, default="all",
                        choices=["perplexity", "caption", "all"])
    parser.add_argument("--model-path", type=str, required=True,
                        help="Path to Qwen2.5-VL-3B-Instruct")
    parser.add_argument("--adapter-path", type=str, default=None,
                        help="Path to LoRA/circulant adapter (optional)")
    parser.add_argument("--eval-data", type=str, required=True,
                        help="Path to eval_qwenvl.jsonl")
    parser.add_argument("--max-samples", type=int, default=None)
    parser.add_argument("--max-new-tokens", type=int, default=256)
    parser.add_argument("--dtype", type=str, default="float16",
                        choices=["float16", "bfloat16"])
    parser.add_argument("--output", type=str, default=None)
    args = parser.parse_args()

    dtype = torch.float16 if args.dtype == "float16" else torch.bfloat16
    model, processor = load_model(args.model_path, args.adapter_path, dtype)
    device = next(model.parameters()).device

    eval_data = load_eval_data(args.eval_data, max_samples=args.max_samples)

    model_name = "Qwen2.5-VL-3B"
    if args.adapter_path:
        # normpath drops a trailing separator; basename("dir/") would
        # otherwise be the empty string.
        adapter_name = os.path.basename(os.path.normpath(args.adapter_path))
        model_name += f" + {adapter_name}"
    results = {"model": model_name, "num_samples": len(eval_data)}

    if args.mode in ("perplexity", "all"):
        ppl = evaluate_perplexity(model, processor, eval_data, device)
        results["perplexity"] = ppl

    if args.mode in ("caption", "all"):
        metrics = evaluate_caption(
            model, processor, eval_data, device, max_new_tokens=args.max_new_tokens)
        results.update(metrics)

    if args.output:
        os.makedirs(os.path.dirname(args.output) or ".", exist_ok=True)
        # ensure_ascii=False can emit non-ASCII characters, so the file
        # encoding must not depend on the platform default.
        with open(args.output, "w", encoding="utf-8") as f:
            json.dump(results, f, indent=2, ensure_ascii=False)
        print(f"\nResults saved to {args.output}")
|
|
|
|
# Run the CLI only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|
|