import os
import time
import json
import numpy as np
from tqdm import tqdm
import torch
from torch.utils.data import DataLoader
import math # For perplexity
import random
from collections import Counter # Word-frequency counts for vocabulary statistics
from .config import EVALUATION_CONFIG, update_config_with_latest_model
from .data_preprocessing import COCOVocabulary

# Import necessary NLTK components for BLEU, METEOR
try:
    import nltk
    from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction
    from nltk.translate.meteor_score import meteor_score
    from nltk.tokenize import word_tokenize
    # Suppress NLTK download messages if already downloaded
    nltk.download('punkt', quiet=True)
    nltk.download('wordnet', quiet=True)
except ImportError:
    print("NLTK not installed or data not downloaded. BLEU/METEOR scores will be skipped.")
    print("Please install NLTK (`pip install nltk`) and download data (`python -c \"import nltk; nltk.download('punkt'); nltk.download('wordnet')\"`)")
    corpus_bleu = None
    meteor_score = None
    word_tokenize = None
    SmoothingFunction = None

# Import ROUGE
try:
    from rouge_score import rouge_scorer
except ImportError:
    print("rouge-score not installed. ROUGE-L score will be skipped.")
    print("Please install it: `pip install rouge-score`")
    rouge_scorer = None

# Import pycocotools and pycocoevalcap for CIDEr
try:
    from pycocotools.coco import COCO
    from pycocoevalcap.eval import COCOEvalCap
    import tempfile
except ImportError:
    print("pycocotools or pycocoevalcap not installed. CIDEr score will be skipped.")
    print("Please install: `pip install pycocotools` and `pip install git+https://github.com/salaniz/pycocoevalcap.git`")
    COCO = None
    COCOEvalCap = None
    tempfile = None


from .model import ImageCaptioningModel # Import the model
from .data_preprocessing import COCODataset # Import dataset
from .utils import get_logger, get_eval_transform # Import utilities

logger = get_logger(__name__)


def calculate_bleu_scores_detailed(references, hypotheses):
    """
    Calculates detailed BLEU scores (BLEU-1 to BLEU-4) for a corpus.
    Args:
        references (list of str): List of reference captions. Each reference is a single string.
        hypotheses (list of str): List of hypothesis (generated) captions. Each hypothesis is a single string.
    Returns:
        dict: A dictionary containing BLEU-1, BLEU-2, BLEU-3, BLEU-4 scores.
              Returns zeros if NLTK is not available or an error occurs.
    """
    if corpus_bleu is None or word_tokenize is None or SmoothingFunction is None:
        logger.error("NLTK requirements for BLEU not met. Returning 0 for BLEU scores.")
        return {'BLEU-1': 0, 'BLEU-2': 0, 'BLEU-3': 0, 'BLEU-4': 0}

    try:
        # Each reference is a list of ONE tokenized sentence (as `corpus_bleu` expects multiple references per hypothesis)
        # We assume one reference per image for simplicity.
        ref_tokens = [[word_tokenize(ref.lower())] for ref in references]
        hyp_tokens = [word_tokenize(hyp.lower()) for hyp in hypotheses]

        # Use smoothing function for better BLEU calculation, especially for short sentences or small test sets
        smooth = SmoothingFunction().method1

        # Calculate corpus-level BLEU scores for different n-grams
        bleu_1 = corpus_bleu(ref_tokens, hyp_tokens, weights=(1, 0, 0, 0), smoothing_function=smooth)
        bleu_2 = corpus_bleu(ref_tokens, hyp_tokens, weights=(0.5, 0.5, 0, 0), smoothing_function=smooth)
        bleu_3 = corpus_bleu(ref_tokens, hyp_tokens, weights=(1/3, 1/3, 1/3, 0), smoothing_function=smooth)
        bleu_4 = corpus_bleu(ref_tokens, hyp_tokens, weights=(0.25, 0.25, 0.25, 0.25), smoothing_function=smooth)

        return {
            'BLEU-1': bleu_1,
            'BLEU-2': bleu_2,
            'BLEU-3': bleu_3,
            'BLEU-4': bleu_4
        }
    except Exception as e:
        logger.error(f"Error calculating BLEU scores: {e}")
        return {'BLEU-1': 0, 'BLEU-2': 0, 'BLEU-3': 0, 'BLEU-4': 0}
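# Illustrative call (hypothetical captions, shown only to document the parallel-list inputs
# shared by the metric helpers in this module):
#   calculate_bleu_scores_detailed(["a dog runs across the grass"],
#                                  ["a dog is running on the grass"])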


def calculate_meteor_score(references, hypotheses):
    """
    Calculates the METEOR score for a corpus.
    Args:
        references (list of str): List of reference captions.
        hypotheses (list of str): List of hypothesis (generated) captions.
    Returns:
        float: Average METEOR score, or None if NLTK/WordNet not available.
    """
    if meteor_score is None or word_tokenize is None:
        logger.error("NLTK requirements for METEOR (wordnet) not met. Returning None for METEOR score.")
        return None

    scores = []
    try:
        for ref, hyp in zip(references, hypotheses):
            ref_tokens = word_tokenize(ref.lower())
            hyp_tokens = word_tokenize(hyp.lower())
            # meteor_score expects a list of reference sentences (even if only one)
            score = meteor_score([ref_tokens], hyp_tokens)
            scores.append(score)

        return np.mean(scores) if scores else 0.0
    except Exception as e:
        logger.error(f"Error calculating METEOR score: {e}")
        return None


def calculate_rouge_l_score(references, hypotheses):
    """
    Calculates the ROUGE-L F-measure score for a corpus.
    Args:
        references (list of str): List of reference captions.
        hypotheses (list of str): List of hypothesis (generated) captions.
    Returns:
        float: Average ROUGE-L score, or None if rouge-score library not available.
    """
    if rouge_scorer is None:
        logger.error("rouge-score library not available. Returning None for ROUGE-L score.")
        return None

    try:
        # Use 'rougeL' for Longest Common Subsequence based ROUGE
        scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)
        scores = []

        for ref, hyp in zip(references, hypotheses):
            score = scorer.score(ref, hyp)
            scores.append(score['rougeL'].fmeasure) # We are interested in the F-measure

        return np.mean(scores) if scores else 0.0
    except Exception as e:
        logger.error(f"Error calculating ROUGE-L score: {e}")
        return None


def calculate_cider_score(references, hypotheses):
    """
    Calculates the CIDEr score using pycocoevalcap library.
    Requires pycocotools and pycocoevalcap to be installed.
    Args:
        references (list of str): List of reference captions.
        hypotheses (list of str): List of hypothesis (generated) captions.
    Returns:
        float: CIDEr score, or None if pycocotools/pycocoevalcap not available.
    """
    if COCO is None or COCOEvalCap is None or tempfile is None:
        logger.error("pycocotools or pycocoevalcap not available. Returning None for CIDEr score.")
        return None

    try:
        # pycocoevalcap requires data in a specific COCO format
        # Create dummy image IDs for the COCO objects
        dummy_image_ids = list(range(len(references)))

        # Format references for COCO
        refs_coco_format = []
        for i, ref_str in enumerate(references):
            refs_coco_format.append({"image_id": dummy_image_ids[i], "id": i, "caption": ref_str})

        # Format hypotheses for COCO
        hyps_coco_format = []
        for i, hyp_str in enumerate(hypotheses):
            hyps_coco_format.append({"image_id": dummy_image_ids[i], "id": i, "caption": hyp_str})

        # Save to temporary JSON files as required by COCO/COCOEvalCap
        with tempfile.NamedTemporaryFile(mode='w', delete=False, suffix='.json') as f_ref:
            # Need to create a minimal COCO-like structure for references
            json.dump({"annotations": refs_coco_format, "images": [{"id": i} for i in dummy_image_ids]}, f_ref)
            ref_path = f_ref.name

        with tempfile.NamedTemporaryFile(mode='w', delete=False, suffix='.json') as f_hyp:
            json.dump(hyps_coco_format, f_hyp)
            hyp_path = f_hyp.name

        # Initialize COCO and COCOEvalCap objects
        coco = COCO(ref_path)
        cocoRes = coco.loadRes(hyp_path)

        cocoEval = COCOEvalCap(coco, cocoRes)
        cocoEval.params['image_id'] = cocoRes.getImgIds() # Specify images to evaluate
        cocoEval.evaluate() # Perform evaluation

        # Clean up temporary files
        os.remove(ref_path)
        os.remove(hyp_path)

        return cocoEval.eval['CIDEr'] # CIDEr score is directly available
    except Exception as e:
        logger.error(f"Error calculating CIDEr score: {e}")
        return None


def calculate_length_statistics(generated_captions, reference_captions):
    """
    Calculates statistics related to caption lengths.
    Args:
        generated_captions (list of str): List of generated captions.
        reference_captions (list of str): List of reference captions.
    Returns:
        dict: Dictionary containing average, std dev, and difference in lengths.
    """
    gen_lengths = [len(cap.split()) for cap in generated_captions]
    ref_lengths = [len(cap.split()) for cap in reference_captions]

    return {
        'avg_generated_length': np.mean(gen_lengths) if gen_lengths else 0,
        'avg_reference_length': np.mean(ref_lengths) if ref_lengths else 0,
        'length_difference': (np.mean(gen_lengths) - np.mean(ref_lengths)) if gen_lengths and ref_lengths else 0,
        'length_std_generated': np.std(gen_lengths) if gen_lengths else 0,
        'length_std_reference': np.std(ref_lengths) if ref_lengths else 0
    }


def calculate_vocabulary_statistics(generated_captions, vocabulary):
    """
    Calculates vocabulary usage statistics for generated captions.
    Args:
        generated_captions (list of str): List of generated captions.
        vocabulary (COCOVocabulary): The vocabulary object used by the model.
    Returns:
        dict: Dictionary with unique word count, vocabulary coverage, etc.
    """
    all_words = []
    for caption in generated_captions:
        words = caption.lower().split()
        all_words.extend(words)

    unique_words = set(all_words)
    word_freq = Counter(all_words)

    return {
        'unique_words_used': len(unique_words),
        'total_vocabulary_size': vocabulary.vocab_size,
        'vocabulary_coverage': len(unique_words) / vocabulary.vocab_size if vocabulary.vocab_size > 0 else 0,
        'avg_word_frequency': np.mean(list(word_freq.values())) if word_freq else 0,
        'most_common_generated_words': word_freq.most_common(10)
    }


def calculate_diversity_metrics(generated_captions):
    """
    Calculates diversity metrics for generated captions, such as Type-Token Ratio (TTR),
    Self-BLEU, and caption uniqueness.
    Args:
        generated_captions (list of str): List of generated captions.
    Returns:
        dict: Dictionary containing diversity metrics.
    """
    # Type-Token Ratio (TTR)
    all_words = []
    for caption in generated_captions:
        words = caption.lower().split()
        all_words.extend(words)

    unique_words = set(all_words)
    ttr = len(unique_words) / len(all_words) if all_words else 0

    # Self-BLEU (diversity measure) - calculate on a subset for efficiency
    self_bleu = 0
    try:
        if corpus_bleu and word_tokenize and SmoothingFunction:
            smooth = SmoothingFunction().method1
            self_bleu_scores = []

            # Sample a subset of generated captions for Self-BLEU to avoid long computation
            sample_size = min(1000, len(generated_captions))
            sampled_captions = random.sample(generated_captions, sample_size) if len(generated_captions) > sample_size else generated_captions

            # Pre-tokenize once so each caption is not re-tokenized for every hypothesis
            tokenized_sample = [word_tokenize(cap.lower()) for cap in sampled_captions]

            for i, hypothesis in enumerate(tokenized_sample):
                # All other captions in the sample serve as references for this hypothesis
                references_for_self_bleu = [tokens for j, tokens in enumerate(tokenized_sample) if j != i]

                if references_for_self_bleu and hypothesis: # Ensure there are references and hypothesis tokens
                    # corpus_bleu pairs one reference list with one hypothesis, so wrap both in single-element lists
                    score = corpus_bleu([references_for_self_bleu], [hypothesis], smoothing_function=smooth)
                    self_bleu_scores.append(score)

            self_bleu = np.mean(self_bleu_scores) if self_bleu_scores else 0
        else:
            logger.warning("NLTK not fully available for Self-BLEU calculation. Skipping.")
    except Exception as e:
        logger.error(f"Error calculating Self-BLEU: {e}")
        self_bleu = 0

    # Caption uniqueness
    unique_captions = len(set(generated_captions))
    uniqueness_ratio = unique_captions / len(generated_captions) if len(generated_captions) > 0 else 0

    return {
        'type_token_ratio': ttr,
        'self_bleu': self_bleu,
        'unique_captions_count': unique_captions,
        'caption_uniqueness_ratio': uniqueness_ratio
    }
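# Interpretation note (guidance for readers of the saved metrics, not used in any computation):
# type_token_ratio and caption_uniqueness_ratio closer to 1.0 indicate more varied output,
# while a higher Self-BLEU means the generated captions resemble one another (less diversity).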


def calculate_perplexity(model, data_loader, vocabulary, device):
    """
    Calculates the perplexity of the model on a given dataset.
    Perplexity measures how well a probability model predicts a sample. Lower is better.
    Args:
        model (nn.Module): The trained image captioning model.
        data_loader (DataLoader): DataLoader for the dataset.
        vocabulary (COCOVocabulary): The vocabulary object.
        device (torch.device): Device to run calculation on.
    Returns:
        float: Perplexity score, or infinity if calculation fails.
    """
    model.eval()
    total_loss = 0
    total_words = 0

    # Use CrossEntropyLoss with sum reduction to get the sum of losses over all tokens
    criterion = torch.nn.CrossEntropyLoss(ignore_index=vocabulary.word2idx['<PAD>'], reduction='sum')

    with torch.no_grad():
        for images, captions_from_loader, caption_lengths_from_loader, _ in tqdm(data_loader, desc="Calculating Perplexity"):
            images = images.to(device)
            captions_for_model = captions_from_loader.to(device)
            caption_lengths_for_model = caption_lengths_from_loader.to(device)

            # Forward pass to get scores
            scores, caps_sorted, decode_lengths, _, _ = model(images, captions_for_model, caption_lengths_for_model)

            # Prepare targets: remove <START> token and slice to match the sequence length of 'scores'.
            # scores are (batch_size, max_decode_len_in_batch, vocab_size)
            # targets should be (batch_size, max_decode_len_in_batch)
            targets = caps_sorted[:, 1:scores.size(1) + 1].contiguous().view(-1) # Flatten targets
            scores_flat = scores.view(-1, scores.size(-1)) # Flatten scores

            loss = criterion(scores_flat, targets) # Calculate loss for all tokens
            total_loss += loss.item()

            # Count non-padded words in the targets that were actually used for loss.
            num_valid_targets_in_batch = targets.ne(vocabulary.word2idx['<PAD>']).sum().item()
            total_words += num_valid_targets_in_batch

    if total_words == 0:
        logger.warning("No valid words found to calculate perplexity (total_words is 0). Returning inf.")
        return float('inf')

    avg_loss_per_word = total_loss / total_words

    # Perplexity is exp(average negative log-likelihood)
    try:
        perplexity = math.exp(avg_loss_per_word)
    except OverflowError: # Handle cases where avg_loss_per_word is very large, leading to overflow
        perplexity = float('inf')

    return perplexity
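# Quick sanity check of the relationship above (numbers are illustrative only):
# an average cross-entropy of 3.0 nats per target token gives perplexity = math.exp(3.0) ≈ 20.1,
# i.e. the model is on average about as uncertain as a uniform choice over ~20 words.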


def print_evaluation_results(metrics):
    """
    Prints the evaluation results in a formatted way.
    Args:
        metrics (dict): Dictionary containing all evaluation metrics.
    """
    logger.info("\n"+"="*60)
    logger.info("                      EVALUATION RESULTS")
    logger.info("="*60)

    # BLEU Scores
    if 'BLEU-1' in metrics:
        logger.info(f"\nBLEU Scores:")
        logger.info(f"  BLEU-1: {metrics['BLEU-1']:.4f}")
        logger.info(f"  BLEU-2: {metrics['BLEU-2']:.4f}")
        logger.info(f"  BLEU-3: {metrics['BLEU-3']:.4f}")
        logger.info(f"  BLEU-4: {metrics['BLEU-4']:.4f}")

    # Other metrics
    if 'METEOR' in metrics and metrics['METEOR'] is not None:
        logger.info(f"\nMETEOR Score: {metrics['METEOR']:.4f}")

    if 'ROUGE-L' in metrics and metrics['ROUGE-L'] is not None:
        logger.info(f"ROUGE-L Score: {metrics['ROUGE-L']:.4f}")

    if 'CIDEr' in metrics and metrics['CIDEr'] is not None:
        logger.info(f"CIDEr Score: {metrics['CIDEr']:.4f}")

    if 'Perplexity' in metrics and metrics['Perplexity'] is not None:
        logger.info(f"Perplexity: {metrics['Perplexity']:.2f}")

    # Length Statistics
    logger.info(f"\nLength Statistics:")
    logger.info(f"  Avg Generated Length: {metrics.get('avg_generated_length', 0):.2f}")
    logger.info(f"  Avg Reference Length: {metrics.get('avg_reference_length', 0):.2f}")
    logger.info(f"  Length Difference (Gen - Ref): {metrics.get('length_difference', 0):.2f}")
    logger.info(f"  Std Dev Generated Length: {metrics.get('length_std_generated', 0):.2f}")
    logger.info(f"  Std Dev Reference Length: {metrics.get('length_std_reference', 0):.2f}")

    # Diversity Metrics
    logger.info(f"\nDiversity Metrics:")
    logger.info(f"  Type-Token Ratio: {metrics.get('type_token_ratio', 0):.4f}")
    logger.info(f"  Caption Uniqueness Ratio: {metrics.get('caption_uniqueness_ratio', 0):.4f}")
    logger.info(f"  Self-BLEU (Higher is lower diversity): {metrics.get('self_bleu', 0):.4f}")
    logger.info(f"  Unique Captions Count: {metrics.get('unique_captions_count', 0)}")

    # Vocabulary Usage
    logger.info(f"\nVocabulary Usage:")
    logger.info(f"  Unique Words Used in Generated Captions: {metrics.get('unique_words_used', 0)}")
    logger.info(f"  Vocabulary Coverage (Used / Total): {metrics.get('vocabulary_coverage', 0):.4f}")
    if 'most_common_generated_words' in metrics:
        logger.info(f"  Most Common Generated Words: {metrics['most_common_generated_words']}")

    logger.info(f"\nEvaluation Info:")
    eval_info = metrics.get('evaluation_info', {})
    logger.info(f"  Total Samples Evaluated: {eval_info.get('total_samples', 0)}")
    logger.info(f"  Evaluation Time: {eval_info.get('evaluation_time_seconds', 0):.2f}s")
    logger.info(f"  Test Data Path: {eval_info.get('test_data_path', 'N/A')}")
    logger.info(f"  Image Directory Used: {eval_info.get('image_dir_used', 'N/A')}")
    logger.info(f"  Device: {eval_info.get('device', 'unknown')}")
    logger.info(f"  Model Architecture: {eval_info.get('model_architecture', 'N/A')}")

    logger.info("="*60)


def save_evaluation_results(metrics, generated_captions, reference_captions, image_ids, output_dir='evaluation_results'):
    """
    Saves detailed evaluation results to JSON files.
    Args:
        metrics (dict): Dictionary containing all evaluation metrics.
        generated_captions (list of str): List of generated captions.
        reference_captions (list of str): List of reference captions.
        image_ids (list): List of original image IDs corresponding to captions.
        output_dir (str): Directory to save the results.
    """
    os.makedirs(output_dir, exist_ok=True) # Ensure output directory exists

    # Save metrics
    metrics_path = os.path.join(output_dir, 'metrics.json')
    # Convert numpy types to Python types for JSON serialization
    serializable_metrics = {}
    for key, value in metrics.items():
        if isinstance(value, (np.float32, np.float64)):
            serializable_metrics[key] = float(value)
        elif isinstance(value, (np.int32, np.int64)):
            serializable_metrics[key] = int(value)
        else:
            serializable_metrics[key] = value

    with open(metrics_path, 'w') as f:
        json.dump(serializable_metrics, f, indent=2)

    # Save generated captions and their references along with image_ids
    captions_data = []
    for img_id, gen_cap, ref_cap in zip(image_ids, generated_captions, reference_captions):
        captions_data.append({
            'image_id': img_id,
            'generated_caption': gen_cap,
            'reference_caption': ref_cap
        })

    captions_path = os.path.join(output_dir, 'captions.json')
    with open(captions_path, 'w') as f:
        json.dump(captions_data, f, indent=2)

    logger.info(f"\nDetailed evaluation results saved to: {output_dir}/")
    logger.info(f"Metrics saved to: {metrics_path}")
    logger.info(f"Captions saved to: {captions_path}")


def perform_evaluation(model, vocabulary, test_config):
    """
    Performs comprehensive evaluation of the image captioning model on a test set.

    Args:
        model (nn.Module): The trained image captioning model.
        vocabulary (COCOVocabulary): The vocabulary object used by the model.
        test_config (dict): Configuration dictionary for evaluation.

    Returns:
        dict: Dictionary containing all evaluation metrics.
    """
    logger.info("Starting comprehensive model evaluation...")
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)
    model.eval() # Set model to evaluation mode
    logger.info(f"Model set to evaluation mode on device: {device}")

    # Data paths for evaluation from config
    data_folder = test_config['data_folder']
    test_image_folder = test_config['test_image_folder']
    test_caption_file = test_config['test_caption_file']

    if not os.path.exists(test_caption_file):
        raise FileNotFoundError(f"Test caption file not found: {test_caption_file}")

    # Construct the correct image directory path for evaluation
    image_dir_for_eval = os.path.join(data_folder, test_image_folder)
    if not os.path.exists(image_dir_for_eval):
        logger.error(f"Image directory for evaluation not found: {image_dir_for_eval}")
        logger.error("Please ensure COCO images are extracted to the correct path.")
        return {'error': f'Image directory not found: {image_dir_for_eval}'}

    logger.info(f"Attempting to load evaluation images from directory: {image_dir_for_eval}")

    # Create test dataset
    test_dataset = COCODataset(
        image_dir=image_dir_for_eval,
        caption_file=test_caption_file,
        vocabulary=vocabulary, # Use the vocabulary from training
        max_caption_length=test_config.get('max_caption_length', 20),
        subset_size=test_config.get('test_subset_size'),
        transform=get_eval_transform() # Use standard eval transform
    )

    test_loader = DataLoader(
        test_dataset,
        batch_size=test_config.get('eval_batch_size', 1), # Batch size 1 is crucial for beam search
        shuffle=False, # Do not shuffle test data
        num_workers=test_config.get('num_workers', 2),
        pin_memory=True
    )

    logger.info(f"Test dataset size: {len(test_dataset)}")

    if len(test_dataset) == 0:
        logger.warning("Test dataset is empty. Evaluation will not produce meaningful results.")
        return {'error': 'Test dataset is empty', 'image_dir_checked': image_dir_for_eval}

    # Generate captions for all test images
    logger.info("Generating captions for evaluation set...")
    generated_captions_list = []
    reference_captions_list = []
    image_ids_list = [] # To store original image IDs for mapping

    eval_start_time = time.time()

    with torch.no_grad(): # Disable gradient calculations
        for i, (images, caption_indices_batch, _, original_image_ids_batch) in enumerate(tqdm(test_loader, desc="Generating captions")):
            images = images.to(device)

            for j in range(images.size(0)): # Iterate through batch (should be size 1 if eval_batch_size=1)
                image_tensor_single = images[j] # Get single image tensor from batch

                # Generate caption using the model's beam search method
                generated_caption = model.generate_caption(
                    image_tensor_single, vocabulary, device,
                    beam_size=test_config.get('beam_size', 5),
                    max_length=test_config.get('max_caption_length', 20)
                )

                # Convert reference caption indices back to string
                reference_caption_str = vocabulary.indices_to_caption(caption_indices_batch[j].cpu().numpy())

                generated_captions_list.append(generated_caption)
                reference_captions_list.append(reference_caption_str)
                # Ensure image_id is a string or compatible type for JSON serialization
                image_ids_list.append(str(original_image_ids_batch[j].item()))

    eval_time = time.time() - eval_start_time
    logger.info(f"Caption generation completed in {eval_time:.2f} seconds for {len(generated_captions_list)} images.")

    if not generated_captions_list or not reference_captions_list:
        logger.error("No captions were generated or no reference captions were loaded. Cannot calculate metrics.")
        return {'error': 'No generated or reference captions available for metric calculation.'}

    # Calculate evaluation metrics
    logger.info("Calculating evaluation metrics...")
    metrics = {}

    # Calculate standard metrics
    bleu_scores = calculate_bleu_scores_detailed(reference_captions_list, generated_captions_list)
    metrics.update(bleu_scores)

    meteor_score_val = calculate_meteor_score(reference_captions_list, generated_captions_list)
    if meteor_score_val is not None:
        metrics['METEOR'] = meteor_score_val

    rouge_score_val = calculate_rouge_l_score(reference_captions_list, generated_captions_list)
    if rouge_score_val is not None:
        metrics['ROUGE-L'] = rouge_score_val

    cider_score_val = calculate_cider_score(reference_captions_list, generated_captions_list)
    if cider_score_val is not None:
        metrics['CIDEr'] = cider_score_val

    # Calculate length and diversity statistics
    length_stats = calculate_length_statistics(generated_captions_list, reference_captions_list)
    metrics.update(length_stats)

    vocab_stats = calculate_vocabulary_statistics(generated_captions_list, vocabulary)
    metrics.update(vocab_stats)

    diversity_stats = calculate_diversity_metrics(generated_captions_list)
    metrics.update(diversity_stats)

    # Calculate perplexity
    try:
        perplexity = calculate_perplexity(model, test_loader, vocabulary, device)
        metrics['Perplexity'] = perplexity
    except Exception as e:
        logger.error(f"Could not calculate perplexity: {e}")

    # Add meta information about the evaluation run
    metrics['evaluation_info'] = {
        'total_samples': len(generated_captions_list),
        'evaluation_time_seconds': eval_time,
        'test_data_path': test_caption_file,
        'image_dir_used': image_dir_for_eval,
        'device': str(device),
        'model_architecture': 'ResNet50 Encoder + LSTM Decoder with Attention',
        'beam_size_for_inference': test_config.get('beam_size', 5),
        'max_caption_length_for_inference': test_config.get('max_caption_length', 20)
    }

    # Print and save results
    print_evaluation_results(metrics)
    save_evaluation_results(metrics, generated_captions_list, reference_captions_list, image_ids_list, output_dir=test_config.get('eval_output_dir', 'output/evaluation_results'))

    return metrics
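# Minimal sketch of the config keys perform_evaluation() reads; the values below are
# placeholders for illustration, not the project's actual paths or settings:
#   EVALUATION_CONFIG = {
#       'data_folder': 'data/coco',
#       'test_image_folder': 'val2017',
#       'test_caption_file': 'data/coco/annotations/captions_val2017.json',
#       'max_caption_length': 20, 'test_subset_size': None,
#       'eval_batch_size': 1, 'num_workers': 2, 'beam_size': 5,
#       'eval_output_dir': 'output/evaluation_results',
#   }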


if __name__ == '__main__':
    # When `evaluation.py` is run directly, it will perform evaluation.
    import pickle # For loading vocabulary
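    # Note: this module relies on package-relative imports (e.g. `from .config import ...`),
    # so run it as a module from the project root rather than as a plain script, for example
    # (the package name here is a placeholder, adjust to the actual package):
    #   python -m <package>.evaluation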

    logger.info("Starting model evaluation process...")

    # Load the vocabulary first
    VOCABULARY_FILE_PATH = 'output/vocabulary.pkl' # Path to the vocabulary file
    if not os.path.exists(VOCABULARY_FILE_PATH):
        logger.error(f"Vocabulary not found at {VOCABULARY_FILE_PATH}. Please train the model first or provide a pre-trained vocabulary.")
        exit() # Exit if vocabulary is not found
    try:
        with open(VOCABULARY_FILE_PATH, 'rb') as f:
            vocabulary = pickle.load(f)
        logger.info(f"Loaded vocabulary from {VOCABULARY_FILE_PATH}")
    except Exception as e:
        logger.error(f"Error loading vocabulary from {VOCABULARY_FILE_PATH}: {e}")
        exit()

    # Update evaluation config to point to the latest trained model
    update_config_with_latest_model(EVALUATION_CONFIG)
    model_path = EVALUATION_CONFIG.get('model_path')

    if not model_path or not os.path.exists(model_path):
        logger.error(f"Model checkpoint not found at {model_path}. Please train the model or specify a valid model_path in config.py.")
        exit()

    try:
        # Load the model state dict and config from the checkpoint
        checkpoint = torch.load(model_path, map_location='cpu')
        model_config_from_checkpoint = checkpoint.get('config', {})

        # Initialize model with parameters from checkpoint config (or defaults if missing)
        eval_model = ImageCaptioningModel(
            vocab_size=vocabulary.vocab_size, # Use the loaded vocabulary's size
            embed_dim=model_config_from_checkpoint.get('embed_dim', 256),
            attention_dim=model_config_from_checkpoint.get('attention_dim', 256),
            decoder_dim=model_config_from_checkpoint.get('decoder_dim', 256),
            dropout=0.0, # No dropout during evaluation
            fine_tune_encoder=False, # No fine-tuning during evaluation
            max_caption_length=model_config_from_checkpoint.get('max_caption_length', 20)
        )
        eval_model.load_state_dict(checkpoint['model_state_dict'])
        logger.info(f"Model loaded successfully from {model_path} for evaluation.")

        # Perform the comprehensive evaluation
        eval_metrics = perform_evaluation(eval_model, vocabulary, EVALUATION_CONFIG)
        logger.info("Model Evaluation Complete!")

    except FileNotFoundError as e:
        logger.error(f"Error during evaluation setup: {e}")
        logger.error("Please ensure the model path and data paths are correct.")
    except Exception as e:
        logger.critical(f"An unexpected error occurred during evaluation: {e}", exc_info=True)