"""
Qwen2.5-7B Text-Only Baseline Evaluation

Computes perplexity on the same held-out caption data WITHOUT images.
This serves as a baseline: a pure text LLM shouldn't predict image captions well.
With --mode caption (or all) it also generates captions from the text-only
prompt and scores them with BLEU / ROUGE-L.

Usage:
    python eval/eval_qwen_baseline.py \
        --model-path qwen_models/models--Qwen--Qwen2.5-7B-Instruct/snapshots/a09a35458c702b33eeacc393d103063234e8bc28 \
        --eval-data data_dir/VoRA-Recap-29M/eval_qwenvl.jsonl
"""
import argparse
import json
import math
import os
import sys
import torch
from tqdm import tqdm
from transformers import AutoModelForCausalLM, AutoTokenizer
# Label value written over the prompt positions so that only caption tokens are
# supervised. -100 is the default ``ignore_index`` of PyTorch's cross-entropy
# loss; presumably the model's built-in loss uses the same value (confirm).
IGNORE_INDEX = -100
def load_eval_data(eval_path, max_samples=None):
    """Load evaluation records from a JSON Lines file.

    Every non-blank line of ``eval_path`` is parsed as one JSON value and the
    values are returned in file order. Blank lines (for example a trailing
    newline at the end of the file) are skipped instead of aborting the load.

    Args:
        eval_path: Path to the ``.jsonl`` file to read.
        max_samples: Stop after this many records have been loaded. ``None``
            or ``0`` means no limit.

    Returns:
        A list containing one parsed JSON value per loaded line.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    data = []
    # Read as UTF-8 explicitly so non-ASCII captions do not depend on the
    # platform's locale encoding.
    with open(eval_path, "r", encoding="utf-8") as f:
        for raw_line in f:
            line = raw_line.strip()
            if not line:
                continue
            data.append(json.loads(line))
            if max_samples and len(data) >= max_samples:
                break
    print(f"Loaded {len(data)} evaluation samples")
    return data
def build_text_only_batch(tokenizer, caption, device):
    """Build a single-sample batch for the text-only baseline.

    Uses the same prompt template as VoRA, but replaces <image> with
    a text instruction "Describe this image." (since there's no image).

    The sequence is ``prompt + caption + <|im_end|>``. Labels mirror the input
    ids, except that every prompt position is set to ``IGNORE_INDEX`` so only
    the caption and the closing ``<|im_end|>`` are supervised.

    Args:
        tokenizer: Object providing ``encode(text, add_special_tokens=...)``
            and ``convert_tokens_to_ids(token)``.
        caption: Reference caption text to score.
        device: Device on which the tensors are created.

    Returns:
        A tuple ``(batch, n_target_tokens)``. ``batch`` is a dict holding
        ``input_ids``, ``attention_mask`` and ``labels``, each of shape
        ``(1, sequence_length)``. ``n_target_tokens`` is the number of
        supervised positions (caption tokens plus the end marker).
    """
    prompt = (
        "<|im_start|>system\n"
        "You are a helpful assistant."
        "<|im_end|>"
        "\n<|im_start|>user\n"
        "Describe this image."
        "<|im_end|>\n<|im_start|>assistant\n"
    )

    # The template already spells out every control token. Encoding the pieces
    # without automatic special tokens keeps a tokenizer that auto-inserts
    # BOS/EOS from placing them mid-sequence or inside the supervised labels.
    prompt_ids = tokenizer.encode(prompt, add_special_tokens=False)
    caption_ids = tokenizer.encode(caption, add_special_tokens=False)
    eos_id = tokenizer.convert_tokens_to_ids("<|im_end|>")

    target_ids = caption_ids + [eos_id]
    full_ids = prompt_ids + target_ids
    labels = [IGNORE_INDEX] * len(prompt_ids) + target_ids

    batch = {
        "input_ids": torch.tensor([full_ids], dtype=torch.long, device=device),
        "attention_mask": torch.ones(1, len(full_ids), dtype=torch.long, device=device),
        "labels": torch.tensor([labels], dtype=torch.long, device=device),
    }
    return batch, len(target_ids)
@torch.no_grad()
def evaluate_perplexity(model, tokenizer, eval_data, device):
    """Compute token-weighted perplexity of the reference captions.

    For every record, the caption under ``item["text"]`` is turned into a
    one-sample batch by ``build_text_only_batch`` and scored by the model. The
    per-sample loss is weighted by its number of supervised tokens, so the
    final figure is a per-token average over the corpus. This assumes
    ``outputs.loss`` is the mean cross-entropy over the non-masked labels.

    A sample that raises, lacks the ``"text"`` key, or yields a non-finite
    loss is counted as an error and skipped. The first five errors are printed.

    Args:
        model: Causal LM that accepts ``input_ids``, ``attention_mask`` and
            ``labels`` and returns an object with a ``loss`` attribute.
        tokenizer: Tokenizer passed through to ``build_text_only_batch``.
        eval_data: Sequence of records, each with a ``"text"`` entry.
        device: Device on which the input tensors are created.

    Returns:
        The corpus perplexity as a float; ``float("inf")`` when no sample
        could be evaluated or the average loss is too large to exponentiate.
    """
    model.eval()
    total_loss = 0.0
    total_tokens = 0
    errors = 0

    for i, item in enumerate(tqdm(eval_data, desc="Qwen Baseline Perplexity")):
        try:
            batch, n_caption_tokens = build_text_only_batch(tokenizer, item["text"], device)
            loss = model(**batch).loss.item()
            # One NaN/inf loss (e.g. a half-precision overflow) would poison
            # the running sum and turn the whole corpus result into NaN.
            if not math.isfinite(loss):
                raise ValueError(f"non-finite loss ({loss})")
        except Exception as e:
            # Deliberately best-effort: one bad sample must not abort the run.
            errors += 1
            if errors <= 5:
                print(f" Error on sample {i}: {e}")
            continue
        total_loss += loss * n_caption_tokens
        total_tokens += n_caption_tokens

    if total_tokens == 0:
        print("No valid samples!")
        return float("inf")

    avg_loss = total_loss / total_tokens
    try:
        perplexity = math.exp(avg_loss)
    except OverflowError:
        # math.exp raises instead of returning inf for very large arguments.
        perplexity = float("inf")

    print("\n=== Qwen2.5-7B Text-Only Baseline ===")
    print(f"Samples evaluated: {len(eval_data) - errors}/{len(eval_data)}")
    print(f"Errors: {errors}")
    print(f"Average cross-entropy loss: {avg_loss:.4f}")
    print(f"Perplexity: {perplexity:.2f}")
    return perplexity
@torch.no_grad()
def evaluate_caption(model, tokenizer, eval_data, device, max_new_tokens=256):
    """Generate captions without any image (text-only baseline) and score them.

    The prompt contains no per-sample information and decoding is greedy
    (``do_sample=False``), so the generated text is the same for every record.
    It is therefore generated once and paired with each reference caption,
    rather than repeating an identical generation per sample.

    Args:
        model: Causal LM exposing ``generate``.
        tokenizer: Tokenizer used to encode the prompt and decode the output.
        eval_data: Sequence of records, each with a reference caption under
            ``"text"``.
        device: Device on which the prompt tensor is placed.
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        The metric dict produced by ``_compute_metrics``, or ``{}`` when there
        is nothing to score (empty data, failed generation, or no usable
        reference).
    """
    model.eval()
    if not eval_data:
        return {}

    prompt = (
        "<|im_start|>system\n"
        "You are a helpful assistant."
        "<|im_end|>"
        "\n<|im_start|>user\n"
        "Describe this image."
        "<|im_end|>\n<|im_start|>assistant\n"
    )
    prompt_ids = tokenizer.encode(prompt)
    eos_id = tokenizer.convert_tokens_to_ids("<|im_end|>")
    input_ids = torch.tensor([prompt_ids], dtype=torch.long).to(device)
    attention_mask = torch.ones_like(input_ids)

    try:
        outputs = model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            max_new_tokens=max_new_tokens,
            do_sample=False,
            pad_token_id=tokenizer.eos_token_id,
            eos_token_id=eos_id,
        )
        # Drop the echoed prompt; keep only the newly generated tokens.
        generated_text = tokenizer.decode(
            outputs[0][len(prompt_ids):], skip_special_tokens=True)
    except Exception as e:
        # Best-effort, as before, but report the failure instead of hiding it.
        print(f" Caption generation failed: {e}")
        return {}

    predictions = []
    references = []
    skipped = 0
    for idx, item in enumerate(eval_data):
        try:
            reference = item["text"]
        except (KeyError, TypeError, IndexError) as e:
            skipped += 1
            if skipped <= 5:
                print(f" Skipping sample {idx}: {e!r}")
            continue
        predictions.append(generated_text)
        references.append(reference)

    if not predictions:
        return {}

    metrics = _compute_metrics(predictions, references)
    print("\n=== Qwen Baseline Caption Results ===")
    print(f"Samples: {len(predictions)}/{len(eval_data)}")
    for k, v in metrics.items():
        print(f"{k}: {v:.4f}")
    print("\n--- Sample Outputs (first 3) ---")
    for i, (pred, ref) in enumerate(zip(predictions[:3], references[:3])):
        print(f"[{i}] Generated: {pred[:200]}")
        print(f"[{i}] Reference: {ref[:200]}")
        print()
    return metrics
def _compute_metrics(predictions, references):
metrics = {}
try:
from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction
smooth = SmoothingFunction().method1
refs = [[ref.split()] for ref in references]
preds = [pred.split() for pred in predictions]
metrics["BLEU-1"] = corpus_bleu(refs, preds, weights=(1, 0, 0, 0), smoothing_function=smooth)
metrics["BLEU-4"] = corpus_bleu(refs, preds, weights=(0.25, 0.25, 0.25, 0.25), smoothing_function=smooth)
except ImportError:
pass
try:
from rouge_score import rouge_scorer
scorer = rouge_scorer.RougeScorer(["rougeL"], use_stemmer=True)
scores = [scorer.score(ref, pred)["rougeL"].fmeasure for pred, ref in zip(predictions, references)]
metrics["ROUGE-L"] = sum(scores) / len(scores)
except ImportError:
pass
return metrics
def main():
    """Command-line entry point for the text-only baseline evaluation.

    Parses the CLI arguments, loads the model and tokenizer from
    ``--model-path``, loads the evaluation records, runs the evaluations
    selected by ``--mode`` and, when ``--output`` is given, writes the
    collected results to that path as JSON.
    """
    parser = argparse.ArgumentParser(description="Qwen2.5-7B Text-Only Baseline")
    parser.add_argument("--mode", type=str, default="all",
                        choices=["perplexity", "caption", "all"])
    parser.add_argument("--model-path", type=str, required=True,
                        help="Path to Qwen2.5-7B-Instruct")
    parser.add_argument("--eval-data", type=str, required=True)
    parser.add_argument("--max-samples", type=int, default=None)
    parser.add_argument("--max-new-tokens", type=int, default=256)
    parser.add_argument("--dtype", type=str, default="float16",
                        choices=["float16", "bfloat16"])
    parser.add_argument("--output", type=str, default=None)
    args = parser.parse_args()

    dtype = torch.float16 if args.dtype == "float16" else torch.bfloat16

    print(f"Loading Qwen2.5-7B from {args.model_path} ...")
    # NOTE(review): trust_remote_code=True lets the checkpoint run its own
    # Python code; only point --model-path at checkpoints you trust.
    model = AutoModelForCausalLM.from_pretrained(
        args.model_path, torch_dtype=dtype, device_map="auto",
        trust_remote_code=True)
    tokenizer = AutoTokenizer.from_pretrained(args.model_path, trust_remote_code=True)
    model.eval()
    # Inputs are placed on the device holding the model's first parameter.
    device = next(model.parameters()).device
    print(f"Model loaded on {device}")

    eval_data = load_eval_data(args.eval_data, max_samples=args.max_samples)
    results = {"model": "Qwen2.5-7B-Instruct (text-only)", "num_samples": len(eval_data)}

    if args.mode in ("perplexity", "all"):
        ppl = evaluate_perplexity(model, tokenizer, eval_data, device)
        results["perplexity"] = ppl

    if args.mode in ("caption", "all"):
        caption_metrics = evaluate_caption(
            model, tokenizer, eval_data, device, max_new_tokens=args.max_new_tokens)
        results.update(caption_metrics)

    if args.output:
        os.makedirs(os.path.dirname(args.output) or ".", exist_ok=True)
        # ensure_ascii=False writes non-ASCII characters verbatim, so the file
        # encoding is pinned instead of inherited from the locale.
        # NOTE: an infinite perplexity is serialized as the non-standard JSON
        # token ``Infinity``.
        with open(args.output, "w", encoding="utf-8") as f:
            json.dump(results, f, indent=2, ensure_ascii=False)
        print(f"\nResults saved to {args.output}")


if __name__ == "__main__":
    main()
|