CaptionIQ / src /evaluate.py
pavanpraneeth's picture
Upload folder using huggingface_hub
290f366 verified
Raw
History Blame Contribute Delete
7.58 kB
"""
CaptionIQ — BLEU Score Evaluation
Generate captions for the test set and compute BLEU-1 through BLEU-4 scores.
Compare VGG16 vs VGG19 performance.
"""
import os
import sys
import json
import argparse
import numpy as np
from tqdm import tqdm
from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from src.config import (
CAPTIONS_FILE, TOKENIZER_FILE,
VGG16_FEATURES_FILE, VGG19_FEATURES_FILE,
VGG16_MODEL_FILE, VGG19_MODEL_FILE,
TEST_IMAGES_FILE, BLEU_RESULTS_FILE,
START_TOKEN, END_TOKEN, MAX_LENGTH,
)
from src.utils import load_captions, load_image_list, load_features, load_tokenizer, get_vocab_size
from src.inference import greedy_search, beam_search
from src.config import BEAM_WIDTH
from tensorflow.keras.models import load_model
# ── Keras compatibility patch ──────────────────────────────────────
# Background (as recorded by the original author):
#   * Checkpoints written by newer Keras carry a 'quantization_config'
#     entry in every layer config; older Keras rejects that kwarg.
#   * An Embedding layer with mask_zero=True serialises a NotEqual op
#     that the legacy h5 loader cannot resolve by name.
# Remedy:
#   1. Wrap Operation.from_config so the extra key is dropped first.
#   2. Expose NotEqual (and the project's attention layer) through a
#      custom-objects mapping handed to load_model().
import keras.src.ops.operation as _keras_op
from keras.src.ops.numpy import NotEqual as _NotEqual

# Grab the plain function behind the classmethod so the wrapper below can
# call it with an explicit ``cls``.
_orig_from_config = _keras_op.Operation.from_config.__func__


@classmethod  # type: ignore[misc]
def _patched_from_config(cls, config):  # noqa: N805
    """Discard ``quantization_config`` from *config*, then defer to Keras."""
    config.pop("quantization_config", None)
    return _orig_from_config(cls, config)


# Install the wrapper in place of the stock classmethod.
setattr(_keras_op.Operation, "from_config", _patched_from_config)

from src.model import BahdanauAttention

# Name → class lookup passed as ``custom_objects`` when loading models.
_CUSTOM_OBJECTS = dict(NotEqual=_NotEqual, BahdanauAttention=BahdanauAttention)
# ───────────────────────────────────────────────────────────────────
def get_references(captions: dict, image_ids: set) -> dict:
    """
    Build the reference word lists used for scoring.

    Every caption belonging to a requested image is split on whitespace
    and the START_TOKEN / END_TOKEN markers are dropped. Image IDs that
    have no entry in ``captions`` are left out of the result.

    Args:
        captions: Mapping of image_id → list of caption strings
        image_ids: Image IDs to collect references for

    Returns:
        dict mapping image_id → list of reference word lists
    """
    def _content_words(caption):
        # Keep every token except the sequence boundary markers.
        return [
            token for token in caption.split()
            if token != START_TOKEN and token != END_TOKEN
        ]

    return {
        image_id: [_content_words(caption) for caption in captions[image_id]]
        for image_id in image_ids
        if image_id in captions
    }
def evaluate_model(backbone: str, captions: dict, features: dict,
                   tokenizer, test_images: set) -> "dict | None":
    """
    Evaluate a trained model on the test set.

    Loads the saved model for ``backbone``, generates one caption per test
    image with beam search, and scores the generated captions against the
    reference captions with corpus-level BLEU-1 through BLEU-4.

    Args:
        backbone: "vgg16" or "vgg19" (any value other than "vgg16" selects
            the VGG19 model file)
        captions: All cleaned captions
        features: Pre-extracted image features
        tokenizer: Fitted tokenizer
        test_images: Set of test image IDs

    Returns:
        dict mapping "BLEU-1" … "BLEU-4" to float scores rounded to four
        decimals, or None when the model file does not exist or when no
        test image had both features and reference captions.
    """
    model_file = VGG16_MODEL_FILE if backbone == "vgg16" else VGG19_MODEL_FILE
    if not os.path.exists(model_file):
        print(f" Warning: Model not found: {model_file}")
        return None

    print(f"\nLoading {backbone.upper()} model...")
    model = load_model(model_file, custom_objects=_CUSTOM_OBJECTS)

    # Take the sequence length from the model itself rather than from config.
    # NOTE(review): assumes the model's second input is the padded caption
    # sequence — confirm against src.model.
    max_length = model.input_shape[1][1]

    references = get_references(captions, test_images)

    # Parallel lists: actual_refs[i] holds the references for hypotheses[i].
    actual_refs = []
    hypotheses = []
    skipped = 0

    print(f"Generating captions for {len(test_images)} test images...")
    # Sorted iteration keeps the evaluation order deterministic.
    for img_id in tqdm(sorted(test_images), desc=f"Evaluating {backbone.upper()}"):
        if img_id not in features or img_id not in references:
            skipped += 1
            continue

        # Use the caption text of the first beam-search result; an empty
        # result list yields an empty hypothesis.
        beam_results = beam_search(model, tokenizer, features[img_id],
                                   max_length, BEAM_WIDTH)
        caption = beam_results[0][0] if beam_results else ""

        hypotheses.append(caption.split())
        actual_refs.append(references[img_id])

    if skipped > 0:
        print(f" Skipped {skipped} images (missing features or references)")
    print(f" Evaluated {len(hypotheses)} images")

    # With nothing to score, BLEU is undefined; report "no result" the same
    # way the missing-model path does instead of calling corpus_bleu on an
    # empty corpus.
    if not hypotheses:
        print(" Warning: No test images could be evaluated; skipping BLEU computation")
        return None

    # ── Compute BLEU scores ──
    smooth = SmoothingFunction().method1
    bleu_scores = {}
    for n in range(1, 5):
        # Uniform weight over the first n n-gram orders, zero for the rest.
        weights = tuple([1.0 / n] * n + [0.0] * (4 - n))
        # float(): keep every stored score a float even if the scorer hands
        # back an integer zero.
        score = float(corpus_bleu(actual_refs, hypotheses, weights=weights,
                                  smoothing_function=smooth))
        bleu_scores[f"BLEU-{n}"] = round(score, 4)
        print(f" BLEU-{n}: {score:.4f}")

    return bleu_scores
def print_comparison_table(results: dict):
    """
    Print a formatted comparison table of VGG16 vs VGG19.

    Args:
        results: Mapping of backbone name ("vgg16" / "vgg19") → dict of
            metric name ("BLEU-1" … "BLEU-4") → numeric score. Missing
            backbones or metrics are shown as "N/A".

    For each metric where both scores are numeric, the last column shows
    ``vgg19 - vgg16`` with an explicit sign; otherwise it shows a dash.
    """
    def _is_score(value) -> bool:
        # Accept ints as well as floats (a score of exactly 0 may be an
        # int); bool is excluded because it is an int subclass.
        return isinstance(value, (int, float)) and not isinstance(value, bool)

    rule = "=" * 55
    print("\n" + rule)
    print(" VGG16 vs VGG19 — BLEU Score Comparison")
    print(rule)
    print(f"{'Metric':<10} {'VGG16':>10} {'VGG19':>10} {'Diff':>10}")
    print("-" * 55)

    vgg16_scores = results.get("vgg16", {})
    vgg19_scores = results.get("vgg19", {})
    for metric in ("BLEU-1", "BLEU-2", "BLEU-3", "BLEU-4"):
        v16 = vgg16_scores.get(metric, "N/A")
        v19 = vgg19_scores.get(metric, "N/A")
        if _is_score(v16) and _is_score(v19):
            diff = v19 - v16
            # The '+' flag keeps the sign attached to the number and makes
            # positive and negative rows the same width.
            print(f"{metric:<10} {v16:>10.4f} {v19:>10.4f} {diff:>+10.4f}")
        else:
            print(f"{metric:<10} {str(v16):>10} {str(v19):>10} {'—':>10}")

    print(rule)
def main():
    """
    Command-line entry point.

    Parses ``--backbone`` (vgg16, vgg19 or both), loads the captions,
    tokenizer and test-image list, evaluates each selected backbone whose
    feature file exists, writes any scores to BLEU_RESULTS_FILE as JSON,
    and prints the comparison table when both backbones produced scores.
    """
    cli = argparse.ArgumentParser(description="Evaluate CaptionIQ with BLEU scores")
    cli.add_argument(
        "--backbone",
        type=str,
        default="vgg19",
        choices=["vgg16", "vgg19", "both"],
        help="Which model(s) to evaluate (default: vgg19)",
    )
    options = cli.parse_args()

    # Fetch the NLTK "punkt" resources up front, without console output.
    import nltk
    for resource in ("punkt", "punkt_tab"):
        nltk.download(resource, quiet=True)

    banner = "=" * 60
    print(banner)
    print(" CaptionIQ — BLEU Score Evaluation")
    print(banner)

    # Data shared by every backbone under evaluation.
    all_captions = load_captions(CAPTIONS_FILE)
    tokenizer = load_tokenizer(TOKENIZER_FILE)
    test_images = load_image_list(TEST_IMAGES_FILE)
    print(f"Test images: {len(test_images)}")

    if options.backbone == "both":
        selected = ["vgg16", "vgg19"]
    else:
        selected = [options.backbone]

    results = {}
    for name in selected:
        if name == "vgg16":
            feature_path = VGG16_FEATURES_FILE
        else:
            feature_path = VGG19_FEATURES_FILE

        if not os.path.exists(feature_path):
            print(f"\n Warning: Features not found: {feature_path}")
            continue

        backbone_features = load_features(feature_path)
        scores = evaluate_model(name, all_captions, backbone_features,
                                tokenizer, test_images)
        # evaluate_model may return nothing; only keep real score dicts.
        if scores:
            results[name] = scores

    # Persist whatever was successfully scored.
    if results:
        with open(BLEU_RESULTS_FILE, "w") as out_file:
            json.dump(results, out_file, indent=2)
        print(f"\nResults saved to: {BLEU_RESULTS_FILE}")

    # The side-by-side table only makes sense with both backbones present.
    if {"vgg16", "vgg19"} <= results.keys():
        print_comparison_table(results)

    print("\n✓ Evaluation complete!")


if __name__ == "__main__":
    main()