""" CaptionIQ — BLEU Score Evaluation Generate captions for the test set and compute BLEU-1 through BLEU-4 scores. Compare VGG16 vs VGG19 performance. """ import os import sys import json import argparse import numpy as np from tqdm import tqdm from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) from src.config import ( CAPTIONS_FILE, TOKENIZER_FILE, VGG16_FEATURES_FILE, VGG19_FEATURES_FILE, VGG16_MODEL_FILE, VGG19_MODEL_FILE, TEST_IMAGES_FILE, BLEU_RESULTS_FILE, START_TOKEN, END_TOKEN, MAX_LENGTH, ) from src.utils import load_captions, load_image_list, load_features, load_tokenizer, get_vocab_size from src.inference import greedy_search, beam_search from src.config import BEAM_WIDTH from tensorflow.keras.models import load_model # ── Keras compatibility patch ────────────────────────────────────── # Models saved with newer Keras include 'quantization_config' in every # layer config. Older Keras versions reject the unknown kwarg. # The Embedding layer's mask_zero=True also serialises a NotEqual op # that the legacy h5 loader can't resolve by name. # Fix: (1) monkey-patch from_config to strip quantization_config, # (2) import NotEqual so we can register it as a custom object. import keras.src.ops.operation as _keras_op from keras.src.ops.numpy import NotEqual as _NotEqual _orig_from_config = _keras_op.Operation.from_config.__func__ @classmethod # type: ignore[misc] def _patched_from_config(cls, config): # noqa: N805 config.pop("quantization_config", None) return _orig_from_config(cls, config) _keras_op.Operation.from_config = _patched_from_config from src.model import BahdanauAttention _CUSTOM_OBJECTS = {"NotEqual": _NotEqual, "BahdanauAttention": BahdanauAttention} # ─────────────────────────────────────────────────────────────────── def get_references(captions: dict, image_ids: set) -> dict: """ Extract reference captions for evaluation. Removes start/end tokens and returns as list of word lists. Returns: dict mapping image_id → list of reference word lists """ references = {} for img_id in image_ids: if img_id not in captions: continue refs = [] for cap in captions[img_id]: # Remove start/end tokens and split into words words = [ w for w in cap.split() if w not in (START_TOKEN, END_TOKEN) ] refs.append(words) references[img_id] = refs return references def evaluate_model(backbone: str, captions: dict, features: dict, tokenizer, test_images: set) -> dict: """ Evaluate a trained model on the test set. Args: backbone: "vgg16" or "vgg19" captions: All cleaned captions features: Pre-extracted image features tokenizer: Fitted tokenizer test_images: Set of test image IDs Returns: dict with BLEU-1 through BLEU-4 scores """ model_file = VGG16_MODEL_FILE if backbone == "vgg16" else VGG19_MODEL_FILE if not os.path.exists(model_file): print(f" Warning: Model not found: {model_file}") return None print(f"\nLoading {backbone.upper()} model...") model = load_model(model_file, custom_objects=_CUSTOM_OBJECTS) # Determine max_length from model max_length = model.input_shape[1][1] # Get references references = get_references(captions, test_images) # Generate captions for test images actual_refs = [] hypotheses = [] skipped = 0 print(f"Generating captions for {len(test_images)} test images...") for img_id in tqdm(sorted(test_images), desc=f"Evaluating {backbone.upper()}"): if img_id not in features or img_id not in references: skipped += 1 continue feature = features[img_id] # Generate caption using beam search for higher quality beam_results = beam_search(model, tokenizer, feature, max_length, BEAM_WIDTH) caption = beam_results[0][0] if beam_results else "" hypothesis = caption.split() hypotheses.append(hypothesis) actual_refs.append(references[img_id]) if skipped > 0: print(f" Skipped {skipped} images (missing features or references)") print(f" Evaluated {len(hypotheses)} images") # ── Compute BLEU scores ── smooth = SmoothingFunction().method1 bleu_scores = {} for n in range(1, 5): weights = tuple([1.0 / n] * n + [0.0] * (4 - n)) score = corpus_bleu(actual_refs, hypotheses, weights=weights, smoothing_function=smooth) bleu_scores[f"BLEU-{n}"] = round(score, 4) print(f" BLEU-{n}: {score:.4f}") return bleu_scores def print_comparison_table(results: dict): """Print a formatted comparison table of VGG16 vs VGG19.""" print("\n" + "=" * 55) print(" VGG16 vs VGG19 — BLEU Score Comparison") print("=" * 55) print(f"{'Metric':<10} {'VGG16':>10} {'VGG19':>10} {'Diff':>10}") print("-" * 55) for metric in ["BLEU-1", "BLEU-2", "BLEU-3", "BLEU-4"]: v16 = results.get("vgg16", {}).get(metric, "N/A") v19 = results.get("vgg19", {}).get(metric, "N/A") if isinstance(v16, float) and isinstance(v19, float): diff = v19 - v16 sign = "+" if diff >= 0 else "" print(f"{metric:<10} {v16:>10.4f} {v19:>10.4f} {sign}{diff:>9.4f}") else: print(f"{metric:<10} {str(v16):>10} {str(v19):>10} {'—':>10}") print("=" * 55) def main(): parser = argparse.ArgumentParser(description="Evaluate CaptionIQ with BLEU scores") parser.add_argument( "--backbone", type=str, default="vgg19", choices=["vgg16", "vgg19", "both"], help="Which model(s) to evaluate (default: vgg19)" ) args = parser.parse_args() import nltk nltk.download("punkt", quiet=True) nltk.download("punkt_tab", quiet=True) print("=" * 60) print(" CaptionIQ — BLEU Score Evaluation") print("=" * 60) # Load shared data all_captions = load_captions(CAPTIONS_FILE) tokenizer = load_tokenizer(TOKENIZER_FILE) test_images = load_image_list(TEST_IMAGES_FILE) print(f"Test images: {len(test_images)}") backbones = ["vgg16", "vgg19"] if args.backbone == "both" else [args.backbone] results = {} for backbone in backbones: features_file = VGG16_FEATURES_FILE if backbone == "vgg16" else VGG19_FEATURES_FILE if not os.path.exists(features_file): print(f"\n Warning: Features not found: {features_file}") continue features = load_features(features_file) scores = evaluate_model(backbone, all_captions, features, tokenizer, test_images) if scores: results[backbone] = scores # Save results if results: with open(BLEU_RESULTS_FILE, "w") as f: json.dump(results, f, indent=2) print(f"\nResults saved to: {BLEU_RESULTS_FILE}") # Print comparison table if both models evaluated if "vgg16" in results and "vgg19" in results: print_comparison_table(results) print("\n✓ Evaluation complete!") if __name__ == "__main__": main()