Spaces:
Sleeping
Sleeping
File size: 7,579 Bytes
290f366 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 | """
CaptionIQ — BLEU Score Evaluation
Generate captions for the test set and compute BLEU-1 through BLEU-4 scores.
Compare VGG16 vs VGG19 performance.
"""
import os
import sys
import json
import argparse
import numpy as np
from tqdm import tqdm
from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from src.config import (
CAPTIONS_FILE, TOKENIZER_FILE,
VGG16_FEATURES_FILE, VGG19_FEATURES_FILE,
VGG16_MODEL_FILE, VGG19_MODEL_FILE,
TEST_IMAGES_FILE, BLEU_RESULTS_FILE,
START_TOKEN, END_TOKEN, MAX_LENGTH,
)
from src.utils import load_captions, load_image_list, load_features, load_tokenizer, get_vocab_size
from src.inference import greedy_search, beam_search
from src.config import BEAM_WIDTH
from tensorflow.keras.models import load_model
# -- Keras compatibility patch ---------------------------------------
# Models saved with newer Keras include 'quantization_config' in every
# layer config. Older Keras versions reject the unknown kwarg.
# The Embedding layer's mask_zero=True also serialises a NotEqual op
# that the legacy h5 loader can't resolve by name.
# Fix: (1) monkey-patch from_config to strip quantization_config,
# (2) import NotEqual so we can register it as a custom object.
import keras.src.ops.operation as _keras_op
from keras.src.ops.numpy import NotEqual as _NotEqual

# Keep the plain function behind the original classmethod so the wrapper
# below can call it with an explicit ``cls``.
_orig_from_config = _keras_op.Operation.from_config.__func__


@classmethod # type: ignore[misc]
def _patched_from_config(cls, config): # noqa: N805
    """Drop ``quantization_config`` from ``config`` and defer to the original ``from_config``."""
    # NOTE: the key is popped from the caller's dict in place (no copy is made).
    config.pop("quantization_config", None)
    return _orig_from_config(cls, config)


# Installed at import time by replacing the attribute on the Operation class.
_keras_op.Operation.from_config = _patched_from_config
from src.model import BahdanauAttention

# Passed to load_model(..., custom_objects=...) so these names can be resolved
# while deserialising a saved model.
_CUSTOM_OBJECTS = {"NotEqual": _NotEqual, "BahdanauAttention": BahdanauAttention}
# ---------------------------------------------------------------------
def get_references(captions: dict, image_ids: set) -> dict:
    """
    Build the reference word lists used for BLEU scoring.

    Every caption of every requested image is split on whitespace and the
    start/end marker tokens are dropped. Image IDs that have no entry in
    ``captions`` are left out of the result.

    Returns:
        dict mapping image_id -> list of reference word lists
    """
    return {
        image_id: [
            [
                token
                for token in caption.split()
                if token != START_TOKEN and token != END_TOKEN
            ]
            for caption in captions[image_id]
        ]
        for image_id in image_ids
        if image_id in captions
    }
def evaluate_model(backbone: str, captions: dict, features: dict,
                   tokenizer, test_images: set) -> "dict | None":
    """
    Evaluate a trained model on the test set.

    Args:
        backbone: "vgg16" selects the VGG16 model file; any other value
            selects the VGG19 model file.
        captions: All cleaned captions (image_id -> list of caption strings).
        features: Pre-extracted image features keyed by image_id.
        tokenizer: Fitted tokenizer, passed through to ``beam_search``.
        test_images: Set of test image IDs.

    Returns:
        dict mapping "BLEU-1" .. "BLEU-4" to scores rounded to 4 decimals,
        or None when the model file is missing or no test image could be
        evaluated.
    """
    model_file = VGG16_MODEL_FILE if backbone == "vgg16" else VGG19_MODEL_FILE
    if not os.path.exists(model_file):
        print(f" Warning: Model not found: {model_file}")
        return None

    print(f"\nLoading {backbone.upper()} model...")
    model = load_model(model_file, custom_objects=_CUSTOM_OBJECTS)

    # Second dimension of the model's second input.
    # NOTE(review): presumably the caption-sequence input - confirm against src.model.
    max_length = model.input_shape[1][1]

    references = get_references(captions, test_images)

    actual_refs = []
    hypotheses = []
    skipped = 0
    print(f"Generating captions for {len(test_images)} test images...")
    # Sorted so the evaluation order is deterministic across runs.
    for img_id in tqdm(sorted(test_images), desc=f"Evaluating {backbone.upper()}"):
        refs = references.get(img_id)
        # An empty reference list counts as "missing references": BLEU
        # cannot be computed against zero references.
        if img_id not in features or not refs:
            skipped += 1
            continue

        # Generate caption using beam search for higher quality
        beam_results = beam_search(model, tokenizer, features[img_id],
                                   max_length, BEAM_WIDTH)
        caption = beam_results[0][0] if beam_results else ""

        # References have the start/end tokens removed, so remove them from
        # the hypothesis too. This is a no-op if beam_search already strips
        # them (not visible from this file).
        hypotheses.append(
            [w for w in caption.split() if w not in (START_TOKEN, END_TOKEN)]
        )
        actual_refs.append(refs)

    if skipped > 0:
        print(f" Skipped {skipped} images (missing features or references)")
    print(f" Evaluated {len(hypotheses)} images")

    if not hypotheses:
        # BLEU over zero hypotheses is meaningless (and may raise inside
        # NLTK), so report "no result" the same way as a missing model.
        print(" Warning: No test images could be evaluated; skipping BLEU computation")
        return None

    # -- Compute BLEU scores --
    smooth = SmoothingFunction().method1
    bleu_scores = {}
    for n in range(1, 5):
        # Uniform weights over the first n n-gram orders, zero for the rest.
        weights = tuple([1.0 / n] * n + [0.0] * (4 - n))
        # corpus_bleu can return the integer 0; normalise to float so that
        # downstream float checks and JSON output stay consistent.
        score = float(corpus_bleu(actual_refs, hypotheses, weights=weights,
                                  smoothing_function=smooth))
        bleu_scores[f"BLEU-{n}"] = round(score, 4)
        print(f" BLEU-{n}: {score:.4f}")
    return bleu_scores
def print_comparison_table(results: dict):
    """
    Print a formatted comparison table of VGG16 vs VGG19.

    Args:
        results: dict that may contain "vgg16" and/or "vgg19" keys, each
            mapping "BLEU-1" .. "BLEU-4" to a numeric score. A missing
            backbone or metric is shown as "N/A".
    """
    print("\n" + "=" * 55)
    print(" VGG16 vs VGG19 — BLEU Score Comparison")
    print("=" * 55)
    print(f"{'Metric':<10} {'VGG16':>10} {'VGG19':>10} {'Diff':>10}")
    print("-" * 55)

    def _is_number(value) -> bool:
        # bool is a subclass of int but is never a valid score.
        return isinstance(value, (int, float)) and not isinstance(value, bool)

    for metric in ["BLEU-1", "BLEU-2", "BLEU-3", "BLEU-4"]:
        v16 = results.get("vgg16", {}).get(metric, "N/A")
        v19 = results.get("vgg19", {}).get(metric, "N/A")
        if _is_number(v16) and _is_number(v19):
            diff = v19 - v16
            # The "+" format flag keeps the sign attached to the number and
            # the column exactly 10 wide for both positive and negative diffs.
            print(f"{metric:<10} {v16:>10.4f} {v19:>10.4f} {diff:>+10.4f}")
        else:
            print(f"{metric:<10} {str(v16):>10} {str(v19):>10} {'—':>10}")
    print("=" * 55)
def main():
    """
    Command-line entry point.

    Parses ``--backbone`` (vgg16, vgg19 or both), evaluates each selected
    model whose feature file exists, writes the collected BLEU scores to
    BLEU_RESULTS_FILE as JSON, and prints a comparison table when both
    backbones produced scores.
    """
    parser = argparse.ArgumentParser(description="Evaluate CaptionIQ with BLEU scores")
    parser.add_argument(
        "--backbone", type=str, default="vgg19",
        choices=["vgg16", "vgg19", "both"],
        help="Which model(s) to evaluate (default: vgg19)"
    )
    args = parser.parse_args()

    # Imported here so the NLTK resources are only fetched when the script
    # is actually run.
    import nltk
    nltk.download("punkt", quiet=True)
    nltk.download("punkt_tab", quiet=True)

    print("=" * 60)
    print(" CaptionIQ — BLEU Score Evaluation")
    print("=" * 60)

    # Load shared data
    all_captions = load_captions(CAPTIONS_FILE)
    tokenizer = load_tokenizer(TOKENIZER_FILE)
    test_images = load_image_list(TEST_IMAGES_FILE)
    print(f"Test images: {len(test_images)}")

    backbones = ["vgg16", "vgg19"] if args.backbone == "both" else [args.backbone]
    results = {}
    for backbone in backbones:
        features_file = VGG16_FEATURES_FILE if backbone == "vgg16" else VGG19_FEATURES_FILE
        if not os.path.exists(features_file):
            print(f"\n Warning: Features not found: {features_file}")
            continue
        features = load_features(features_file)
        scores = evaluate_model(backbone, all_captions, features, tokenizer, test_images)
        if scores:
            results[backbone] = scores

    # Save results
    if results:
        # Evaluation is slow; do not lose the scores just because the
        # output folder has not been created yet.
        out_dir = os.path.dirname(os.path.abspath(BLEU_RESULTS_FILE))
        os.makedirs(out_dir, exist_ok=True)
        with open(BLEU_RESULTS_FILE, "w", encoding="utf-8") as f:
            json.dump(results, f, indent=2)
        print(f"\nResults saved to: {BLEU_RESULTS_FILE}")
    else:
        print("\n Warning: No models were evaluated; nothing to save")

    # Print comparison table if both models evaluated
    if "vgg16" in results and "vgg19" in results:
        print_comparison_table(results)

    print("\n✓ Evaluation complete!")


if __name__ == "__main__":
    main()
|