"""
Qwen2.5-7B Text-Only Baseline Evaluation

Computes perplexity on the same held-out caption data WITHOUT images.
This serves as baseline: a pure text LLM shouldn't predict image captions well.
Depending on --mode, it can also generate a caption from the text-only prompt
and score it against the references with BLEU / ROUGE-L.

Usage:
    python eval/eval_qwen_baseline.py \
        --model-path qwen_models/models--Qwen--Qwen2.5-7B-Instruct/snapshots/a09a35458c702b33eeacc393d103063234e8bc28 \
        --eval-data data_dir/VoRA-Recap-29M/eval_qwenvl.jsonl
"""

import argparse
import json
import math
import os
import sys

import torch
from tqdm import tqdm
from transformers import AutoModelForCausalLM, AutoTokenizer

# Label value used to mask prompt positions out of the loss.
IGNORE_INDEX = -100

# Token that closes an assistant turn; used as the caption terminator.
_END_OF_TURN = "<|im_end|>"

# Text-only prompt shared by the perplexity and caption evaluations, so that
# both modes are guaranteed to condition on exactly the same context.
_TEXT_ONLY_PROMPT = (
    "<|im_start|>system\n"
    "You are a helpful assistant."
    "<|im_end|>"
    "\n<|im_start|>user\n"
    "Describe this image."
    "<|im_end|>\n<|im_start|>assistant\n"
)

# Only the first few per-sample errors are printed to keep the log readable.
_MAX_REPORTED_ERRORS = 5


def load_eval_data(eval_path, max_samples=None):
    """Load evaluation samples from a JSON-lines file.

    Args:
        eval_path: Path to a file with one JSON object per line.
        max_samples: Stop after this many samples. ``None`` (or 0) loads
            the whole file.

    Returns:
        A list with one parsed JSON value per non-blank line.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    data = []
    with open(eval_path, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                # Blank lines (e.g. a trailing newline) are not samples.
                continue
            data.append(json.loads(line))
            if max_samples and len(data) >= max_samples:
                break
    print(f"Loaded {len(data)} evaluation samples")
    return data


def build_text_only_batch(tokenizer, caption, device):
    """Build a single-sample batch for the text-only baseline.

    Uses the same prompt template as VoRA, but replaces the image placeholder
    with a text instruction "Describe this image." (since there's no image).

    Args:
        tokenizer: Tokenizer providing ``encode`` and ``convert_tokens_to_ids``.
        caption: Reference caption text to be scored.
        device: Device the returned tensors are moved to.

    Returns:
        A ``(batch, n_target_tokens)`` tuple. ``batch`` holds ``input_ids``,
        ``attention_mask`` and ``labels`` tensors with a leading batch
        dimension of 1; prompt positions in ``labels`` are ``IGNORE_INDEX``.
        ``n_target_tokens`` is the number of caption tokens plus one for the
        end-of-turn token.
    """
    prompt_ids = tokenizer.encode(_TEXT_ONLY_PROMPT)
    # The caption is appended mid-sequence, so it must not receive any
    # sequence-level special tokens of its own.
    caption_ids = tokenizer.encode(caption, add_special_tokens=False)
    eos_id = tokenizer.convert_tokens_to_ids(_END_OF_TURN)

    target_ids = caption_ids + [eos_id]
    full_ids = prompt_ids + target_ids
    labels = [IGNORE_INDEX] * len(prompt_ids) + target_ids

    batch = {
        "input_ids": torch.tensor([full_ids], dtype=torch.long).to(device),
        "attention_mask": torch.ones(1, len(full_ids), dtype=torch.long).to(device),
        "labels": torch.tensor([labels], dtype=torch.long).to(device),
    }
    return batch, len(target_ids)


@torch.no_grad()
def evaluate_perplexity(model, tokenizer, eval_data, device):
    """Compute token-weighted perplexity of the captions given the text prompt.

    Each sample's loss (as returned by the model for the masked labels) is
    weighted by its number of target tokens, so the result is a corpus-level
    per-token perplexity rather than a mean of per-sample perplexities.

    Samples that fail (missing ``"text"`` field, model error, non-finite loss)
    are counted as errors and excluded; the first few are printed.

    Args:
        model: Causal LM called as ``model(**batch)`` and exposing ``.loss``.
        tokenizer: Tokenizer passed to ``build_text_only_batch``.
        eval_data: Sequence of dicts with a ``"text"`` caption field.
        device: Device for the input tensors.

    Returns:
        The perplexity as a float, or ``inf`` if no sample could be scored.
    """
    model.eval()
    total_loss = 0.0
    total_tokens = 0
    errors = 0

    for i, item in enumerate(tqdm(eval_data, desc="Qwen Baseline Perplexity")):
        try:
            batch, n_caption_tokens = build_text_only_batch(
                tokenizer, item["text"], device)
            loss_value = model(**batch).loss.item()
            if not math.isfinite(loss_value):
                # A single NaN/inf would silently poison the whole aggregate.
                raise ValueError(f"non-finite loss: {loss_value}")
        except Exception as e:
            errors += 1
            if errors <= _MAX_REPORTED_ERRORS:
                print(f"  Error on sample {i}: {e}")
            continue
        total_loss += loss_value * n_caption_tokens
        total_tokens += n_caption_tokens

    if total_tokens == 0:
        print("No valid samples!")
        return float("inf")

    avg_loss = total_loss / total_tokens
    try:
        perplexity = math.exp(avg_loss)
    except OverflowError:
        # exp() of a very large loss does not fit in a float.
        perplexity = float("inf")

    print("\n=== Qwen2.5-7B Text-Only Baseline ===")
    print(f"Samples evaluated: {len(eval_data) - errors}/{len(eval_data)}")
    print(f"Errors: {errors}")
    print(f"Average cross-entropy loss: {avg_loss:.4f}")
    print(f"Perplexity: {perplexity:.2f}")
    return perplexity


@torch.no_grad()
def evaluate_caption(model, tokenizer, eval_data, device, max_new_tokens=256):
    """Generate captions without any image (text-only baseline).

    The prompt contains no per-sample information and decoding is greedy
    (``do_sample=False``), so the generated text is the same for every sample.
    It is therefore generated once and scored against every reference.

    Args:
        model: Causal LM exposing ``generate``.
        tokenizer: Tokenizer used to encode the prompt and decode the output.
        eval_data: Sequence of dicts with a ``"text"`` reference caption.
        device: Device for the input tensors.
        max_new_tokens: Generation length limit.

    Returns:
        A dict of metric name to score (see ``_compute_metrics``), or an empty
        dict if there are no usable references or generation fails.
    """
    model.eval()

    references = []
    skipped = 0
    for i, item in enumerate(eval_data):
        try:
            references.append(item["text"])
        except (KeyError, TypeError) as e:
            skipped += 1
            if skipped <= _MAX_REPORTED_ERRORS:
                print(f"  Skipping sample {i}: missing caption ({e!r})")

    if not references:
        print("No valid samples!")
        return {}

    prompt_ids = tokenizer.encode(_TEXT_ONLY_PROMPT)
    eos_id = tokenizer.convert_tokens_to_ids(_END_OF_TURN)
    input_ids = torch.tensor([prompt_ids], dtype=torch.long).to(device)

    try:
        outputs = model.generate(
            input_ids=input_ids,
            attention_mask=torch.ones_like(input_ids),
            max_new_tokens=max_new_tokens,
            do_sample=False,
            pad_token_id=tokenizer.eos_token_id,
            eos_token_id=eos_id,
        )
    except Exception as e:
        # Best-effort: report and let the caller continue without caption
        # metrics instead of discarding already computed results.
        print(f"  Caption generation failed: {e}")
        return {}

    # Strip the prompt; only the newly generated continuation is the caption.
    generated = outputs[0][len(prompt_ids):]
    text = tokenizer.decode(generated, skip_special_tokens=True)
    predictions = [text] * len(references)

    metrics = _compute_metrics(predictions, references)

    print("\n=== Qwen Baseline Caption Results ===")
    print(f"Samples: {len(predictions)}/{len(eval_data)}")
    for k, v in metrics.items():
        print(f"{k}: {v:.4f}")

    print("\n--- Sample Outputs (first 3) ---")
    for i, (pred, ref) in enumerate(zip(predictions[:3], references[:3])):
        print(f"[{i}] Generated: {pred[:200]}")
        print(f"[{i}] Reference: {ref[:200]}")
        print()

    return metrics


def _compute_metrics(predictions, references):
    """Score predictions against references with BLEU and ROUGE-L.

    ``nltk`` and ``rouge_score`` are optional; a metric whose package is not
    installed is skipped with a notice instead of failing the evaluation.

    Args:
        predictions: List of generated caption strings.
        references: List of reference caption strings, aligned with
            ``predictions``.

    Returns:
        A dict that may contain ``"BLEU-1"``, ``"BLEU-4"`` and ``"ROUGE-L"``.
        Empty if there are no predictions or neither package is available.
    """
    metrics = {}
    if not predictions:
        return metrics

    try:
        from nltk.translate.bleu_score import SmoothingFunction, corpus_bleu
    except ImportError:
        print("nltk is not installed; skipping BLEU metrics")
    else:
        smooth = SmoothingFunction().method1
        # corpus_bleu expects a list of reference *lists* per hypothesis.
        refs = [[ref.split()] for ref in references]
        preds = [pred.split() for pred in predictions]
        metrics["BLEU-1"] = corpus_bleu(
            refs, preds, weights=(1, 0, 0, 0), smoothing_function=smooth)
        metrics["BLEU-4"] = corpus_bleu(
            refs, preds, weights=(0.25, 0.25, 0.25, 0.25),
            smoothing_function=smooth)

    try:
        from rouge_score import rouge_scorer
    except ImportError:
        print("rouge_score is not installed; skipping ROUGE-L")
    else:
        scorer = rouge_scorer.RougeScorer(["rougeL"], use_stemmer=True)
        scores = [
            scorer.score(ref, pred)["rougeL"].fmeasure
            for pred, ref in zip(predictions, references)
        ]
        metrics["ROUGE-L"] = sum(scores) / len(scores)

    return metrics


def main():
    """Parse CLI arguments, run the selected evaluations and save results."""
    parser = argparse.ArgumentParser(description="Qwen2.5-7B Text-Only Baseline")
    parser.add_argument("--mode", type=str, default="all",
                        choices=["perplexity", "caption", "all"])
    parser.add_argument("--model-path", type=str, required=True,
                        help="Path to Qwen2.5-7B-Instruct")
    parser.add_argument("--eval-data", type=str, required=True)
    parser.add_argument("--max-samples", type=int, default=None)
    parser.add_argument("--max-new-tokens", type=int, default=256)
    parser.add_argument("--dtype", type=str, default="float16",
                        choices=["float16", "bfloat16"])
    parser.add_argument("--output", type=str, default=None)
    args = parser.parse_args()

    dtype = torch.float16 if args.dtype == "float16" else torch.bfloat16

    print(f"Loading Qwen2.5-7B from {args.model_path} ...")
    # NOTE(review): trust_remote_code=True executes Python shipped with the
    # checkpoint; only point --model-path at a trusted directory.
    model = AutoModelForCausalLM.from_pretrained(
        args.model_path, torch_dtype=dtype, device_map="auto",
        trust_remote_code=True)
    tokenizer = AutoTokenizer.from_pretrained(
        args.model_path, trust_remote_code=True)
    model.eval()
    # Inputs are placed on the device of the model's first parameter.
    device = next(model.parameters()).device
    print(f"Model loaded on {device}")

    eval_data = load_eval_data(args.eval_data, max_samples=args.max_samples)
    results = {
        "model": "Qwen2.5-7B-Instruct (text-only)",
        "num_samples": len(eval_data),
    }

    if args.mode in ("perplexity", "all"):
        ppl = evaluate_perplexity(model, tokenizer, eval_data, device)
        results["perplexity"] = ppl

    if args.mode in ("caption", "all"):
        caption_metrics = evaluate_caption(
            model, tokenizer, eval_data, device,
            max_new_tokens=args.max_new_tokens)
        results.update(caption_metrics)

    if args.output:
        os.makedirs(os.path.dirname(args.output) or ".", exist_ok=True)
        # ensure_ascii=False can emit non-ASCII text, so the file encoding
        # must not depend on the platform locale.
        with open(args.output, "w", encoding="utf-8") as f:
            json.dump(results, f, indent=2, ensure_ascii=False)
        print(f"\nResults saved to {args.output}")


if __name__ == "__main__":
    main()