"""
Qwen2.5-7B Text-Only Baseline Evaluation

Computes perplexity on the same held-out caption data WITHOUT images.
This serves as a baseline: a pure text LLM shouldn't predict image captions well.
With ``--mode caption`` (or the default ``--mode all``) the script also generates
a caption from the text-only prompt and scores it with BLEU / ROUGE-L.

Usage:
    python eval/eval_qwen_baseline.py \
        --model-path qwen_models/models--Qwen--Qwen2.5-7B-Instruct/snapshots/a09a35458c702b33eeacc393d103063234e8bc28 \
        --eval-data data_dir/VoRA-Recap-29M/eval_qwenvl.jsonl
"""
|
|
| import argparse |
| import json |
| import math |
| import os |
| import sys |
|
|
| import torch |
| from tqdm import tqdm |
| from transformers import AutoModelForCausalLM, AutoTokenizer |
|
|
# Label value written over the prompt positions in build_text_only_batch so
# that only caption tokens are scored (-100 is the default ``ignore_index`` of
# PyTorch's cross-entropy loss).
IGNORE_INDEX = -100
|
|
|
|
def load_eval_data(eval_path, max_samples=None):
    """Load evaluation samples from a JSON Lines file.

    Args:
        eval_path: Path to a UTF-8 text file holding one JSON value per line.
        max_samples: If truthy, stop reading once this many samples have been
            collected; ``None`` (or 0) loads the whole file.

    Returns:
        list: The decoded JSON values, in file order. Blank lines are skipped.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    data = []
    # Captions may contain non-ASCII text, so decode explicitly as UTF-8
    # instead of relying on the platform's locale encoding.
    with open(eval_path, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                # A blank line (e.g. trailing newline at EOF) is not a record;
                # json.loads("") would raise.
                continue
            data.append(json.loads(line))
            if max_samples and len(data) >= max_samples:
                break
    print(f"Loaded {len(data)} evaluation samples")
    return data
|
|
|
|
def build_text_only_batch(tokenizer, caption, device):
    """Build a one-sample batch for scoring ``caption`` without any image.

    Uses the same prompt template as VoRA, but replaces <image> with
    a text instruction "Describe this image." (since there's no image).

    Args:
        tokenizer: Tokenizer exposing ``encode`` and ``convert_tokens_to_ids``.
        caption: Reference caption text whose tokens are the prediction targets.
        device: Device on which the returned tensors are allocated.

    Returns:
        tuple: ``(batch, n_target_tokens)``. ``batch`` is a dict with
        ``input_ids``, ``attention_mask`` and ``labels`` tensors of shape
        ``(1, seq_len)``; ``labels`` holds ``IGNORE_INDEX`` on every prompt
        position. ``n_target_tokens`` is the caption token count plus one for
        the closing ``<|im_end|>``.
    """
    prompt = (
        "<|im_start|>system\n"
        "You are a helpful assistant."
        "<|im_end|>"
        "\n<|im_start|>user\n"
        "Describe this image."
        "<|im_end|>\n<|im_start|>assistant\n"
    )

    prompt_ids = tokenizer.encode(prompt)
    # The caption is a continuation of the prompt, not a sequence of its own:
    # a tokenizer that auto-adds BOS/EOS must not inject them mid-sequence,
    # where they would be scored as caption tokens.
    caption_ids = tokenizer.encode(caption, add_special_tokens=False)
    eos_id = tokenizer.convert_tokens_to_ids("<|im_end|>")

    target_ids = caption_ids + [eos_id]
    full_ids = prompt_ids + target_ids
    # Only the caption and its end-of-turn token are prediction targets.
    labels = [IGNORE_INDEX] * len(prompt_ids) + target_ids

    # Allocate directly on the target device instead of CPU-then-copy.
    batch = {
        "input_ids": torch.tensor([full_ids], dtype=torch.long, device=device),
        "attention_mask": torch.ones(
            1, len(full_ids), dtype=torch.long, device=device),
        "labels": torch.tensor([labels], dtype=torch.long, device=device),
    }
    return batch, len(target_ids)
|
|
|
|
@torch.no_grad()
def evaluate_perplexity(model, tokenizer, eval_data, device):
    """Compute token-weighted perplexity of the captions given a text-only prompt.

    Each sample is scored on its own. The per-sample loss is multiplied by the
    sample's target-token count so the corpus average is per token rather than
    per sample (assumes ``outputs.loss`` is the mean over non-ignored label
    positions, the Hugging Face causal-LM convention).

    Args:
        model: Causal LM; called as ``model(**batch)`` and read via ``.loss``.
        tokenizer: Tokenizer forwarded to ``build_text_only_batch``.
        eval_data: Sequence of samples, each with its caption under ``"text"``.
        device: Device on which input tensors are created.

    Returns:
        float: ``exp`` of the average loss, or ``inf`` when no sample could be
        scored (or the exponent overflows).
    """
    model.eval()
    total_loss = 0.0
    total_tokens = 0
    errors = 0

    for i, item in enumerate(tqdm(eval_data, desc="Qwen Baseline Perplexity")):
        try:
            # The key lookup sits inside the try so one malformed record is
            # counted as an error instead of aborting the whole evaluation.
            batch, n_caption_tokens = build_text_only_batch(
                tokenizer, item["text"], device)
            loss = model(**batch).loss.item()
            if not math.isfinite(loss):
                # A single NaN/inf (e.g. float16 overflow) would otherwise
                # poison the running total for the entire corpus.
                raise ValueError(f"non-finite loss ({loss})")
        except Exception as e:
            # Best-effort evaluation: report the first few failures, count all.
            errors += 1
            if errors <= 5:
                print(f" Error on sample {i}: {e}")
            continue
        total_loss += loss * n_caption_tokens
        total_tokens += n_caption_tokens

    if total_tokens == 0:
        print("No valid samples!")
        return float("inf")

    avg_loss = total_loss / total_tokens
    try:
        perplexity = math.exp(avg_loss)
    except OverflowError:
        # math.exp raises instead of returning inf for very large arguments.
        perplexity = float("inf")
    print(f"\n=== Qwen2.5-7B Text-Only Baseline ===")
    print(f"Samples evaluated: {len(eval_data) - errors}/{len(eval_data)}")
    print(f"Errors: {errors}")
    print(f"Average cross-entropy loss: {avg_loss:.4f}")
    print(f"Perplexity: {perplexity:.2f}")
    return perplexity
|
|
|
|
@torch.no_grad()
def evaluate_caption(model, tokenizer, eval_data, device, max_new_tokens=256):
    """Generate captions without any image (text-only baseline).

    The prompt contains nothing sample-specific and decoding is greedy
    (``do_sample=False``), so the generated text is the same for every sample.
    It is therefore generated once and paired with each reference, rather than
    re-running an identical ``generate`` call per sample.

    Args:
        model: Causal LM exposing ``generate``.
        tokenizer: Tokenizer used to encode the prompt and decode the output.
        eval_data: Sequence of samples, each with its reference under ``"text"``.
        device: Device on which the prompt tensor is created.
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        dict: Metric name -> value from ``_compute_metrics``; empty when
        generation fails or no sample has a usable reference.
    """
    model.eval()

    prompt = (
        "<|im_start|>system\n"
        "You are a helpful assistant."
        "<|im_end|>"
        "\n<|im_start|>user\n"
        "Describe this image."
        "<|im_end|>\n<|im_start|>assistant\n"
    )
    prompt_ids = tokenizer.encode(prompt)
    eos_id = tokenizer.convert_tokens_to_ids("<|im_end|>")

    try:
        input_ids = torch.tensor([prompt_ids], dtype=torch.long, device=device)
        outputs = model.generate(
            input_ids=input_ids,
            attention_mask=torch.ones_like(input_ids),
            max_new_tokens=max_new_tokens,
            do_sample=False,
            pad_token_id=tokenizer.eos_token_id,
            eos_token_id=eos_id,
        )
        # generate() returns prompt + continuation; keep only the continuation.
        generated = outputs[0][len(prompt_ids):]
        text = tokenizer.decode(generated, skip_special_tokens=True)
    except Exception as e:
        # Still best-effort (no metrics on failure), but no longer silent.
        print(f"Caption generation failed: {e}")
        return {}

    predictions = []
    references = []
    skipped = 0
    for item in eval_data:
        try:
            reference = item["text"]
        except (KeyError, TypeError):
            skipped += 1
            continue
        predictions.append(text)
        references.append(reference)
    if skipped:
        print(f"Skipped {skipped} sample(s) without a usable \"text\" field")

    if not predictions:
        return {}

    metrics = _compute_metrics(predictions, references)
    print(f"\n=== Qwen Baseline Caption Results ===")
    print(f"Samples: {len(predictions)}/{len(eval_data)}")
    for k, v in metrics.items():
        print(f"{k}: {v:.4f}")

    print(f"\n--- Sample Outputs (first 3) ---")
    for i in range(min(3, len(predictions))):
        print(f"[{i}] Generated: {predictions[i][:200]}")
        print(f"[{i}] Reference: {references[i][:200]}")
        print()
    return metrics
|
|
|
|
| def _compute_metrics(predictions, references): |
| metrics = {} |
| try: |
| from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction |
| smooth = SmoothingFunction().method1 |
| refs = [[ref.split()] for ref in references] |
| preds = [pred.split() for pred in predictions] |
| metrics["BLEU-1"] = corpus_bleu(refs, preds, weights=(1, 0, 0, 0), smoothing_function=smooth) |
| metrics["BLEU-4"] = corpus_bleu(refs, preds, weights=(0.25, 0.25, 0.25, 0.25), smoothing_function=smooth) |
| except ImportError: |
| pass |
| try: |
| from rouge_score import rouge_scorer |
| scorer = rouge_scorer.RougeScorer(["rougeL"], use_stemmer=True) |
| scores = [scorer.score(ref, pred)["rougeL"].fmeasure for pred, ref in zip(predictions, references)] |
| metrics["ROUGE-L"] = sum(scores) / len(scores) |
| except ImportError: |
| pass |
| return metrics |
|
|
|
|
def main():
    """Parse CLI arguments, load the model and data, run the evaluations.

    ``--mode`` selects perplexity, caption generation, or both. When
    ``--output`` is given, the collected results are written there as UTF-8
    JSON, creating the parent directory if needed.
    """
    parser = argparse.ArgumentParser(description="Qwen2.5-7B Text-Only Baseline")
    parser.add_argument("--mode", type=str, default="all",
                        choices=["perplexity", "caption", "all"])
    parser.add_argument("--model-path", type=str, required=True,
                        help="Path to Qwen2.5-7B-Instruct")
    parser.add_argument("--eval-data", type=str, required=True)
    parser.add_argument("--max-samples", type=int, default=None)
    parser.add_argument("--max-new-tokens", type=int, default=256)
    parser.add_argument("--dtype", type=str, default="float16",
                        choices=["float16", "bfloat16"])
    parser.add_argument("--output", type=str, default=None)
    args = parser.parse_args()

    dtype = torch.float16 if args.dtype == "float16" else torch.bfloat16

    print(f"Loading Qwen2.5-7B from {args.model_path} ...")
    model = AutoModelForCausalLM.from_pretrained(
        args.model_path, torch_dtype=dtype, device_map="auto",
        trust_remote_code=True)
    tokenizer = AutoTokenizer.from_pretrained(args.model_path, trust_remote_code=True)
    model.eval()
    # Inputs are created on the device of the model's first parameter.
    device = next(model.parameters()).device
    print(f"Model loaded on {device}")

    eval_data = load_eval_data(args.eval_data, max_samples=args.max_samples)
    results = {"model": "Qwen2.5-7B-Instruct (text-only)", "num_samples": len(eval_data)}

    if args.mode in ("perplexity", "all"):
        ppl = evaluate_perplexity(model, tokenizer, eval_data, device)
        results["perplexity"] = ppl

    if args.mode in ("caption", "all"):
        caption_metrics = evaluate_caption(
            model, tokenizer, eval_data, device, max_new_tokens=args.max_new_tokens)
        results.update(caption_metrics)

    if args.output:
        # JSON has no literal for inf/NaN; json.dump would emit the
        # non-standard tokens Infinity/NaN, which strict parsers reject.
        # Write null for such values (e.g. perplexity when nothing was scored).
        serializable = {
            key: None if isinstance(value, float) and not math.isfinite(value) else value
            for key, value in results.items()
        }
        os.makedirs(os.path.dirname(args.output) or ".", exist_ok=True)
        # ensure_ascii=False emits raw non-ASCII characters, so the file
        # encoding must be fixed to UTF-8 rather than left to the locale.
        with open(args.output, "w", encoding="utf-8") as f:
            json.dump(serializable, f, indent=2, ensure_ascii=False)
        print(f"\nResults saved to {args.output}")


if __name__ == "__main__":
    main()
|
|