# File size: 11,819 Bytes
#
# 2e7f2ce

"""
Qwen2.5-VL-3B Evaluation Script
Evaluates the original Qwen2.5-VL-3B-Instruct (with vision) on held-out caption data.
Also supports evaluating LoRA / block-circulant finetuned versions if checkpoints exist.

Usage:
  # Original model
  python eval/eval_qwen_vl.py --mode all \
      --model-path Finetune-Qwen2.5-VL/Qwen2.5-VL-3B-Instruct \
      --eval-data data_dir/VoRA-Recap-29M/eval_qwenvl.jsonl

  # With LoRA adapter
  python eval/eval_qwen_vl.py --mode all \
      --model-path Finetune-Qwen2.5-VL/Qwen2.5-VL-3B-Instruct \
      --adapter-path Finetune-Qwen2.5-VL/saves/Qwen2.5-VL-3B-Instruct/lora \
      --eval-data data_dir/VoRA-Recap-29M/eval_qwenvl.jsonl
"""

import argparse
import json
import math
import os
import sys

import torch
from PIL import Image
from tqdm import tqdm
from transformers import (
    AutoModelForCausalLM,
    AutoProcessor,
    AutoTokenizer,
    Qwen2VLForConditionalGeneration,
)

# Label value written over prompt positions so that only caption tokens are
# counted and scored; -100 is the default ``ignore_index`` of PyTorch's
# cross-entropy loss.
IGNORE_INDEX = -100


# ============================================================
# Data loading
# ============================================================

def load_eval_data(eval_path, max_samples=None):
    """Load evaluation records from a JSON Lines file.

    Args:
        eval_path: Path to a JSONL file holding one JSON value per line.
        max_samples: Stop after this many records have been read. ``None``
            (or ``0``) reads the whole file.

    Returns:
        A list of the decoded records, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    data = []
    # Captions may contain non-ASCII text, so do not rely on the platform's
    # default encoding when reading the file.
    with open(eval_path, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                # Blank lines (e.g. a trailing newline at EOF) are not records.
                continue
            data.append(json.loads(line))
            if max_samples and len(data) >= max_samples:
                break
    print(f"Loaded {len(data)} evaluation samples")
    return data


# ============================================================
# Build inputs for Qwen2.5-VL
# ============================================================

def build_messages(image_path, caption=None):
    """Assemble the chat turns for one image-captioning example.

    Args:
        image_path: Path to the image; it is made absolute and embedded as a
            ``file://`` URI in the user turn.
        caption: Optional reference caption. When it is not ``None`` an
            assistant turn carrying this text is appended.

    Returns:
        A list of message dicts: system turn, user turn (image + instruction)
        and, if ``caption`` was given, an assistant turn.
    """
    image_uri = f"file://{os.path.abspath(image_path)}"

    system_turn = {
        "role": "system",
        "content": [{"type": "text", "text": "You are a helpful assistant."}],
    }
    user_turn = {
        "role": "user",
        "content": [
            {"type": "image", "image": image_uri},
            {"type": "text", "text": "Describe this image."},
        ],
    }
    conversation = [system_turn, user_turn]

    if caption is None:
        return conversation

    # The reference caption plays the role of the assistant's reply.
    assistant_turn = {
        "role": "assistant",
        "content": [{"type": "text", "text": caption}],
    }
    conversation.append(assistant_turn)
    return conversation


def prepare_perplexity_inputs(processor, image_path, caption, device):
    """Encode an (image, caption) pair together with prompt-masked labels.

    The conversation is rendered twice: once including the caption as the
    assistant reply, and once as a bare prompt ending in the generation
    prompt. The token length of the bare prompt decides how many leading
    label positions are overwritten with ``IGNORE_INDEX``.

    Args:
        processor: Object providing ``apply_chat_template`` and a callable
            interface that encodes text and images.
        image_path: Path of the image to open.
        caption: Reference caption used as the assistant reply.
        device: Device the encoded batch is moved to.

    Returns:
        ``(inputs, n_caption_tokens)`` where ``inputs`` is the encoded full
        conversation with a ``"labels"`` entry added, and
        ``n_caption_tokens`` is the number of label positions that are not
        ``IGNORE_INDEX``.
    """
    # Conversation including the ground-truth caption.
    full_text = processor.apply_chat_template(
        build_messages(image_path, caption=caption),
        tokenize=False,
        add_generation_prompt=False,
    )
    # Same conversation cut off right where the assistant reply would begin.
    prompt_text = processor.apply_chat_template(
        build_messages(image_path, caption=None),
        tokenize=False,
        add_generation_prompt=True,
    )

    rgb_image = Image.open(image_path).convert("RGB")

    def _encode(text):
        # Both renderings go through the processor with the same image.
        encoded = processor(
            text=[text], images=[rgb_image], padding=True, return_tensors="pt"
        )
        return encoded.to(device)

    batch = _encode(full_text)
    prompt_batch = _encode(prompt_text)

    # Everything up to the prompt length is context, not a scoring target.
    # NOTE(review): assumes the prompt tokens are an exact prefix of the full
    # sequence's tokens — confirm for captions starting with whitespace.
    n_prompt = prompt_batch["input_ids"].shape[1]
    targets = batch["input_ids"].clone()
    targets[:, :n_prompt] = IGNORE_INDEX
    batch["labels"] = targets

    scored = int((targets != IGNORE_INDEX).sum().item())
    return batch, scored


def prepare_generation_inputs(processor, image_path, device):
    """Encode the captioning prompt for one image, ready for generation.

    Args:
        processor: Object providing ``apply_chat_template`` and a callable
            interface that encodes text and images.
        image_path: Path of the image to open.
        device: Device the encoded batch is moved to.

    Returns:
        The processor output for the prompt (no assistant reply, generation
        prompt appended) and the RGB image, moved to ``device``.
    """
    prompt_text = processor.apply_chat_template(
        build_messages(image_path, caption=None),
        tokenize=False,
        add_generation_prompt=True,
    )
    rgb_image = Image.open(image_path).convert("RGB")
    encoded = processor(
        text=[prompt_text],
        images=[rgb_image],
        padding=True,
        return_tensors="pt",
    )
    return encoded.to(device)


# ============================================================
# Evaluation: Perplexity
# ============================================================

@torch.no_grad()
def evaluate_perplexity(model, processor, eval_data, device):
    """Compute token-weighted caption perplexity over ``eval_data``.

    Each sample's loss (as returned by the model for the prompt-masked
    labels) is weighted by its number of scored tokens, so the reported
    average is per token rather than per sample.

    Samples are counted as errors and excluded from the aggregate when the
    image file is missing, when preparation or the forward pass raises, when
    no label position is scored, or when the loss is NaN/inf. Without the
    last two checks a single bad sample would turn the corpus-level result
    into NaN/inf.

    Args:
        model: Model called as ``model(**inputs)`` whose output has ``.loss``.
        processor: Passed through to ``prepare_perplexity_inputs``.
        eval_data: Sequence of dicts with ``"image"`` and ``"text"`` keys.
        device: Device the inputs are moved to.

    Returns:
        The perplexity ``exp(average loss)``, or ``float("inf")`` when no
        sample could be scored or the exponential overflows.
    """
    model.eval()
    total_loss = 0.0
    total_tokens = 0
    errors = 0

    for i, item in enumerate(tqdm(eval_data, desc="Qwen-VL Perplexity")):
        image_path = item["image"]
        caption = item["text"]

        if not os.path.exists(image_path):
            errors += 1
            continue

        try:
            inputs, n_tokens = prepare_perplexity_inputs(
                processor, image_path, caption, device)
            if n_tokens == 0:
                # A mean-reduced loss over zero targets is undefined (NaN).
                raise ValueError("no caption tokens to score")
            loss_value = model(**inputs).loss.item()
            if not math.isfinite(loss_value):
                # Keep one overflowed/NaN sample from poisoning the average.
                raise ValueError(f"non-finite loss ({loss_value})")
        except Exception as e:
            errors += 1
            if errors <= 5:
                print(f"  Error on sample {i}: {e}")
            continue

        total_loss += loss_value * n_tokens
        total_tokens += n_tokens

    if total_tokens == 0:
        print("No valid samples!")
        return float("inf")

    avg_loss = total_loss / total_tokens
    try:
        perplexity = math.exp(avg_loss)
    except OverflowError:
        # exp() of a very large average loss does not fit in a float.
        perplexity = float("inf")
    print(f"\n=== Qwen2.5-VL Perplexity Results ===")
    print(f"Samples: {len(eval_data) - errors}/{len(eval_data)}")
    print(f"Errors: {errors}")
    print(f"Average CE loss: {avg_loss:.4f}")
    print(f"Perplexity: {perplexity:.2f}")
    return perplexity


# ============================================================
# Evaluation: Caption Generation
# ============================================================

@torch.no_grad()
def evaluate_caption(model, processor, eval_data, device, max_new_tokens=256):
    """Generate captions greedily and score them against the references.

    Samples whose image file is missing, or whose preparation/generation
    raises, are counted as errors and skipped; the first five exception
    messages are printed.

    Args:
        model: Model exposing ``generate``.
        processor: Passed to ``prepare_generation_inputs``; its ``tokenizer``
            decodes the generated ids.
        eval_data: Sequence of dicts with ``"image"`` and ``"text"`` keys.
        device: Device the inputs are moved to.
        max_new_tokens: Upper bound on generated tokens per sample.

    Returns:
        The dict produced by ``_compute_metrics``, or ``{}`` when no sample
        yielded a prediction.
    """
    model.eval()
    hypotheses, targets = [], []
    n_failed = 0

    progress = tqdm(eval_data, desc="Qwen-VL Caption")
    for idx, sample in enumerate(progress):
        img_file, reference = sample["image"], sample["text"]

        if os.path.exists(img_file):
            try:
                batch = prepare_generation_inputs(processor, img_file, device)
                n_prompt = batch["input_ids"].shape[1]

                sequences = model.generate(
                    **batch,
                    max_new_tokens=max_new_tokens,
                    do_sample=False,
                )

                # Drop the echoed prompt; keep only newly generated ids.
                new_ids = sequences[0][n_prompt:]
                decoded = processor.tokenizer.decode(
                    new_ids, skip_special_tokens=True)
                hypotheses.append(decoded)
                targets.append(reference)
            except Exception as exc:
                n_failed += 1
                if n_failed <= 5:
                    print(f"  Error on sample {idx}: {exc}")
        else:
            n_failed += 1

    if len(hypotheses) == 0:
        print("No valid samples!")
        return {}

    metrics = _compute_metrics(hypotheses, targets)
    print(f"\n=== Qwen2.5-VL Caption Results ===")
    print(f"Samples: {len(hypotheses)}/{len(eval_data)}")
    print(f"Errors: {n_failed}")
    for name, value in metrics.items():
        print(f"{name}: {value:.4f}")

    print(f"\n--- Sample Outputs (first 5) ---")
    for n, (hyp, ref) in enumerate(zip(hypotheses[:5], targets[:5])):
        print(f"[{n}] Generated: {hyp[:200]}")
        print(f"[{n}] Reference: {ref[:200]}")
        print()

    return metrics


def _compute_metrics(predictions, references):
    """Score generated captions against references with BLEU and ROUGE-L.

    Both scorers are optional: if ``nltk`` or ``rouge_score`` cannot be
    imported, a warning is printed and the corresponding entries are simply
    absent from the result.

    Args:
        predictions: Generated caption strings.
        references: Reference caption strings, aligned with ``predictions``.

    Returns:
        A dict that may contain ``"BLEU-1"``, ``"BLEU-4"`` (corpus level,
        whitespace tokenised, smoothed) and ``"ROUGE-L"`` (mean F-measure).
    """
    results = {}

    try:
        from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction

        smoothing = SmoothingFunction().method1
        # corpus_bleu expects a list of reference *sets* per hypothesis.
        ref_tokens = [[text.split()] for text in references]
        hyp_tokens = [text.split() for text in predictions]
        bleu_variants = (
            ("BLEU-1", (1, 0, 0, 0)),
            ("BLEU-4", (0.25, 0.25, 0.25, 0.25)),
        )
        for label, ngram_weights in bleu_variants:
            results[label] = corpus_bleu(
                ref_tokens,
                hyp_tokens,
                weights=ngram_weights,
                smoothing_function=smoothing,
            )
    except ImportError:
        print("Warning: nltk not installed. pip install nltk")

    try:
        from rouge_score import rouge_scorer

        rouge = rouge_scorer.RougeScorer(["rougeL"], use_stemmer=True)
        f_measures = []
        for hyp, ref in zip(predictions, references):
            # RougeScorer.score takes (target, prediction) in that order.
            f_measures.append(rouge.score(ref, hyp)["rougeL"].fmeasure)
        results["ROUGE-L"] = sum(f_measures) / len(f_measures)
    except ImportError:
        print("Warning: rouge-score not installed. pip install rouge-score")

    return results


# ============================================================
# Model loading
# ============================================================

def load_model(model_path, adapter_path=None, dtype=torch.float16):
    """Load the processor and model, optionally merging a PEFT adapter.

    Model classes are tried in order until one loads. When the installed
    transformers exposes ``Qwen2_5_VLForConditionalGeneration`` and the
    checkpoint's config reports ``model_type == "qwen2_5_vl"``, that class is
    tried first; loading a Qwen2.5-VL checkpoint through the Qwen2-VL class
    is an architecture mismatch. The Qwen2-VL class and
    ``AutoModelForCausalLM`` remain as fallbacks, and the reason for each
    failed attempt is printed instead of being swallowed.

    Args:
        model_path: Local path or identifier passed to ``from_pretrained``.
        adapter_path: Optional adapter directory. It is loaded and merged
            only if the path exists; otherwise a warning is printed and the
            base model is returned.
        dtype: Torch dtype used for the model weights.

    Returns:
        ``(model, processor)`` with the model in eval mode.

    Raises:
        Exception: Whatever the last attempted loader raised, if every
            candidate class failed.
    """
    print(f"Loading Qwen2.5-VL from {model_path} ...")
    processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)

    # Legacy order; the Qwen2.5-VL class is put in front when applicable.
    candidates = [Qwen2VLForConditionalGeneration, AutoModelForCausalLM]
    try:
        # Local import: older transformers releases do not ship this class.
        from transformers import AutoConfig, Qwen2_5_VLForConditionalGeneration

        config = AutoConfig.from_pretrained(model_path, trust_remote_code=True)
        if getattr(config, "model_type", None) == "qwen2_5_vl":
            candidates.insert(0, Qwen2_5_VLForConditionalGeneration)
    except ImportError:
        pass  # No Qwen2.5-VL support installed; keep the legacy order.
    except Exception as e:
        print(f"  Could not inspect model config ({e}); using default loaders.")

    model = None
    last_error = None
    for model_cls in candidates:
        try:
            model = model_cls.from_pretrained(
                model_path,
                torch_dtype=dtype,
                device_map="auto",
                trust_remote_code=True,
            )
            break
        except Exception as e:
            last_error = e
            print(f"  {model_cls.__name__} failed to load: {e}")
    if model is None:
        raise last_error

    # Load LoRA adapter if provided
    if adapter_path:
        if os.path.exists(adapter_path):
            print(f"Loading adapter from {adapter_path} ...")
            from peft import PeftModel
            model = PeftModel.from_pretrained(model, adapter_path)
            model = model.merge_and_unload()
            print("Adapter merged.")
        else:
            # Still best-effort, but never silently: the numbers that follow
            # belong to the base model, not to a finetuned one.
            print(f"WARNING: adapter path {adapter_path} does not exist; "
                  "evaluating the base model without an adapter.")

    model.eval()
    device = next(model.parameters()).device
    print(f"Model loaded on {device}")
    return model, processor


# ============================================================
# Main
# ============================================================

def main():
    """Parse CLI arguments, run the selected evaluations and save results.

    Loads the model (and adapter, if any), reads the evaluation set, runs
    perplexity and/or caption evaluation depending on ``--mode`` and, when
    ``--output`` is given, writes the collected results as JSON.

    The adapter name is added to the reported model label only when the
    adapter path exists, mirroring the condition under which ``load_model``
    actually applies an adapter.
    """
    parser = argparse.ArgumentParser(description="Qwen2.5-VL-3B Evaluation")
    parser.add_argument("--mode", type=str, default="all",
                        choices=["perplexity", "caption", "all"])
    parser.add_argument("--model-path", type=str, required=True,
                        help="Path to Qwen2.5-VL-3B-Instruct")
    parser.add_argument("--adapter-path", type=str, default=None,
                        help="Path to LoRA/circulant adapter (optional)")
    parser.add_argument("--eval-data", type=str, required=True,
                        help="Path to eval_qwenvl.jsonl")
    parser.add_argument("--max-samples", type=int, default=None)
    parser.add_argument("--max-new-tokens", type=int, default=256)
    parser.add_argument("--dtype", type=str, default="float16",
                        choices=["float16", "bfloat16"])
    parser.add_argument("--output", type=str, default=None)
    args = parser.parse_args()

    dtype = torch.float16 if args.dtype == "float16" else torch.bfloat16
    model, processor = load_model(args.model_path, args.adapter_path, dtype)
    device = next(model.parameters()).device

    eval_data = load_eval_data(args.eval_data, max_samples=args.max_samples)

    model_name = "Qwen2.5-VL-3B"
    if args.adapter_path:
        if os.path.exists(args.adapter_path):
            # normpath drops a trailing separator, which would otherwise make
            # basename() return an empty adapter name.
            adapter_name = os.path.basename(os.path.normpath(args.adapter_path))
            model_name += f" + {adapter_name}"
        else:
            print(f"WARNING: adapter path {args.adapter_path} not found; "
                  "results are labelled as the base model.")
    results = {"model": model_name, "num_samples": len(eval_data)}

    if args.mode in ("perplexity", "all"):
        ppl = evaluate_perplexity(model, processor, eval_data, device)
        results["perplexity"] = ppl

    if args.mode in ("caption", "all"):
        metrics = evaluate_caption(
            model, processor, eval_data, device, max_new_tokens=args.max_new_tokens)
        results.update(metrics)

    if args.output:
        # dirname is "" for a bare filename; fall back to the current dir.
        os.makedirs(os.path.dirname(args.output) or ".", exist_ok=True)
        with open(args.output, "w") as f:
            json.dump(results, f, indent=2, ensure_ascii=False)
        print(f"\nResults saved to {args.output}")


# Run the evaluation CLI only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()