Spaces:
Sleeping
Sleeping
| """ | |
| CaptionIQ — BLEU Score Evaluation | |
| Generate captions for the test set and compute BLEU-1 through BLEU-4 scores. | |
| Compare VGG16 vs VGG19 performance. | |
| """ | |
| import os | |
| import sys | |
| import json | |
| import argparse | |
| import numpy as np | |
| from tqdm import tqdm | |
| from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction | |
| sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) | |
| from src.config import ( | |
| CAPTIONS_FILE, TOKENIZER_FILE, | |
| VGG16_FEATURES_FILE, VGG19_FEATURES_FILE, | |
| VGG16_MODEL_FILE, VGG19_MODEL_FILE, | |
| TEST_IMAGES_FILE, BLEU_RESULTS_FILE, | |
| START_TOKEN, END_TOKEN, MAX_LENGTH, | |
| ) | |
| from src.utils import load_captions, load_image_list, load_features, load_tokenizer, get_vocab_size | |
| from src.inference import greedy_search, beam_search | |
| from src.config import BEAM_WIDTH | |
| from tensorflow.keras.models import load_model | |
# ── Keras compatibility patch ──────────────────────────────────────
# Models saved with newer Keras include 'quantization_config' in every
# layer config. Older Keras versions reject the unknown kwarg.
# The Embedding layer's mask_zero=True also serialises a NotEqual op
# that the legacy h5 loader can't resolve by name.
# Fix: (1) monkey-patch from_config to strip quantization_config,
#      (2) import NotEqual so we can register it as a custom object.
import keras.src.ops.operation as _keras_op
from keras.src.ops.numpy import NotEqual as _NotEqual

# ``from_config`` is accessed through the class, so it is a bound method;
# ``__func__`` unwraps it to the plain function taking ``(cls, config)``.
_orig_from_config = _keras_op.Operation.from_config.__func__  # type: ignore[misc]


def _patched_from_config(cls, config):  # noqa: N805
    """Drop ``quantization_config`` from ``config`` and defer to the original.

    Args:
        cls: The class being deserialised (supplied by the classmethod binding).
        config: Layer/operation config dict; ``quantization_config`` is removed
            from it in place before delegating.

    Returns:
        Whatever the original ``Operation.from_config`` returns for ``cls``.
    """
    config.pop("quantization_config", None)
    return _orig_from_config(cls, config)


# Re-wrap as a classmethod: assigning the bare function would make
# ``SomeLayer.from_config(config)`` bind ``config`` to ``cls`` and fail with a
# missing-argument TypeError.
_keras_op.Operation.from_config = classmethod(_patched_from_config)

from src.model import BahdanauAttention

# Passed to load_model so these names can be resolved during deserialisation.
_CUSTOM_OBJECTS = {"NotEqual": _NotEqual, "BahdanauAttention": BahdanauAttention}
# ───────────────────────────────────────────────────────────────────
def get_references(captions: dict, image_ids: set) -> dict:
    """
    Build the reference captions used for evaluation.

    Every caption of every requested image is whitespace-split, and words
    equal to START_TOKEN or END_TOKEN are dropped. Image ids that have no
    entry in ``captions`` are left out of the result.

    Returns:
        dict mapping image_id → list of reference word lists
    """
    return {
        image_id: [
            [
                token
                for token in caption.split()
                if token != START_TOKEN and token != END_TOKEN
            ]
            for caption in captions[image_id]
        ]
        for image_id in image_ids
        if image_id in captions
    }
def evaluate_model(backbone: str, captions: dict, features: dict,
                   tokenizer, test_images: set) -> "dict | None":
    """
    Evaluate a trained model on the test set.

    Loads the saved model for ``backbone``, generates one caption per test
    image with beam search, and scores the generated captions against the
    reference captions with corpus-level BLEU-1 … BLEU-4 (smoothing method1).

    Args:
        backbone: "vgg16" selects VGG16_MODEL_FILE; any other value selects
            VGG19_MODEL_FILE
        captions: All cleaned captions (image_id → list of caption strings)
        features: Pre-extracted image features keyed by image_id
        tokenizer: Fitted tokenizer, passed through to beam_search
        test_images: Set of test image IDs

    Returns:
        dict with "BLEU-1" through "BLEU-4" scores rounded to 4 decimals, or
        None when the model file is missing or no test image could be
        evaluated (every image lacked features or references).
    """
    model_file = VGG16_MODEL_FILE if backbone == "vgg16" else VGG19_MODEL_FILE
    if not os.path.exists(model_file):
        print(f" Warning: Model not found: {model_file}")
        return None

    print(f"\nLoading {backbone.upper()} model...")
    model = load_model(model_file, custom_objects=_CUSTOM_OBJECTS)

    # Determine max_length from model: second dimension of the model's second
    # input (presumably the caption-sequence input — confirm against src.model).
    max_length = model.input_shape[1][1]

    # Get references
    references = get_references(captions, test_images)

    # Generate captions for test images; the two lists stay index-aligned.
    actual_refs = []
    hypotheses = []
    skipped = 0
    print(f"Generating captions for {len(test_images)} test images...")
    # sorted() gives a deterministic evaluation order.
    for img_id in tqdm(sorted(test_images), desc=f"Evaluating {backbone.upper()}"):
        if img_id not in features or img_id not in references:
            skipped += 1
            continue
        feature = features[img_id]
        # Generate caption using beam search for higher quality
        beam_results = beam_search(model, tokenizer, feature, max_length, BEAM_WIDTH)
        # The first element of the first result is used as the caption text;
        # an empty result list yields an empty hypothesis.
        caption = beam_results[0][0] if beam_results else ""
        hypotheses.append(caption.split())
        actual_refs.append(references[img_id])

    if skipped > 0:
        print(f" Skipped {skipped} images (missing features or references)")
    print(f" Evaluated {len(hypotheses)} images")

    # Scoring an empty corpus is meaningless and may raise inside corpus_bleu
    # (zero n-gram denominators), so bail out instead.
    if not hypotheses:
        print(" Warning: No test images could be evaluated; skipping BLEU computation")
        return None

    # ── Compute BLEU scores ──
    smooth = SmoothingFunction().method1
    bleu_scores = {}
    for n in range(1, 5):
        # Uniform weight over the first n n-gram orders, zero for the rest.
        weights = tuple([1.0 / n] * n + [0.0] * (4 - n))
        score = corpus_bleu(actual_refs, hypotheses, weights=weights,
                            smoothing_function=smooth)
        bleu_scores[f"BLEU-{n}"] = round(score, 4)
        print(f" BLEU-{n}: {score:.4f}")
    return bleu_scores
def print_comparison_table(results: dict):
    """Print a formatted comparison table of VGG16 vs VGG19.

    Args:
        results: Mapping of backbone name ("vgg16" / "vgg19") to a dict of
            metric name ("BLEU-1" … "BLEU-4") → score. Missing backbones or
            metrics are shown as "N/A" with no difference.

    The Diff column is VGG19 minus VGG16, always printed with an explicit
    sign and right-aligned to the same width as the other columns.
    """
    def _is_number(value) -> bool:
        # Accept ints too (e.g. a score of exactly 0), but not booleans.
        return isinstance(value, (int, float)) and not isinstance(value, bool)

    print("\n" + "=" * 55)
    print(" VGG16 vs VGG19 — BLEU Score Comparison")
    print("=" * 55)
    print(f"{'Metric':<10} {'VGG16':>10} {'VGG19':>10} {'Diff':>10}")
    print("-" * 55)
    for metric in ["BLEU-1", "BLEU-2", "BLEU-3", "BLEU-4"]:
        v16 = results.get("vgg16", {}).get(metric, "N/A")
        v19 = results.get("vgg19", {}).get(metric, "N/A")
        if _is_number(v16) and _is_number(v19):
            diff = v19 - v16
            # '+' in the format spec keeps the sign attached to the number and
            # the column 10 wide for both positive and negative differences.
            print(f"{metric:<10} {v16:>10.4f} {v19:>10.4f} {diff:>+10.4f}")
        else:
            print(f"{metric:<10} {str(v16):>10} {str(v19):>10} {'—':>10}")
    print("=" * 55)
def main():
    """Command-line entry point: evaluate the selected backbone(s) with BLEU.

    Parses ``--backbone`` (vgg16 / vgg19 / both), loads the captions,
    tokenizer and test image list, evaluates each selected backbone whose
    feature file exists, writes any scores to BLEU_RESULTS_FILE as JSON, and
    prints the comparison table when both backbones produced scores.
    """
    cli = argparse.ArgumentParser(description="Evaluate CaptionIQ with BLEU scores")
    cli.add_argument(
        "--backbone",
        type=str,
        default="vgg19",
        choices=["vgg16", "vgg19", "both"],
        help="Which model(s) to evaluate (default: vgg19)",
    )
    args = cli.parse_args()

    # Imported lazily; both resources are fetched quietly.
    import nltk
    for resource in ("punkt", "punkt_tab"):
        nltk.download(resource, quiet=True)

    banner = "=" * 60
    print(banner)
    print(" CaptionIQ — BLEU Score Evaluation")
    print(banner)

    # Data shared by every backbone
    all_captions = load_captions(CAPTIONS_FILE)
    tokenizer = load_tokenizer(TOKENIZER_FILE)
    test_images = load_image_list(TEST_IMAGES_FILE)
    print(f"Test images: {len(test_images)}")

    if args.backbone == "both":
        selected = ["vgg16", "vgg19"]
    else:
        selected = [args.backbone]

    results = {}
    for name in selected:
        if name == "vgg16":
            features_path = VGG16_FEATURES_FILE
        else:
            features_path = VGG19_FEATURES_FILE

        if not os.path.exists(features_path):
            print(f"\n Warning: Features not found: {features_path}")
            continue

        scores = evaluate_model(
            name, all_captions, load_features(features_path), tokenizer, test_images
        )
        # None (or an empty dict) means nothing was scored for this backbone.
        if scores:
            results[name] = scores

    # Persist whatever was scored
    if results:
        with open(BLEU_RESULTS_FILE, "w") as out:
            json.dump(results, out, indent=2)
        print(f"\nResults saved to: {BLEU_RESULTS_FILE}")

    # The side-by-side table only makes sense with both models present
    if all(key in results for key in ("vgg16", "vgg19")):
        print_comparison_table(results)

    print("\n✓ Evaluation complete!")
| if __name__ == "__main__": | |
| main() | |