Spaces:

pchandragrid
/

image_captioning

Running

File size: 1,863 Bytes

a745a5e

import os
from typing import Any

from PIL import Image
from pycocoevalcap.cider.cider import Cider
from tqdm import tqdm


def generate_caption(
    model: Any,
    processor: Any,
    image: Image.Image,
    device,
    *,
    max_length: int = 30,
    num_beams: int = 5,
) -> str:
    """Run the captioning model on a single image and return the decoded caption.

    The image is preprocessed with ``processor``, the resulting inputs are
    moved to ``device``, and ``model.generate`` is called with gradient
    tracking disabled. Only the first generated sequence is decoded.

    Args:
        model: Object exposing ``generate(**inputs, max_length=..., num_beams=...)``.
        processor: Callable accepting ``images=`` and ``return_tensors=`` whose
            result supports ``.to(device)`` and ``**`` unpacking, and which
            also exposes ``decode(ids, skip_special_tokens=...)``.
        image: The image to caption.
        device: Target handed to ``.to()`` on the processor output.
        max_length: Value forwarded to ``model.generate`` as ``max_length``.
        num_beams: Value forwarded to ``model.generate`` as ``num_beams``.

    Returns:
        The decoded caption with special tokens skipped.
    """
    # Function-scope import keeps torch out of the module's import-time
    # dependencies.
    import torch

    inputs = processor(images=image, return_tensors="pt").to(device)

    # Generation is inference only, so no autograd graph is needed.
    with torch.no_grad():
        generated_ids = model.generate(
            **inputs,
            max_length=max_length,
            num_beams=num_beams,
        )

    return processor.decode(generated_ids[0], skip_special_tokens=True)


def evaluate_cider(
    model: Any,
    processor: Any,
    val_dataset,
    device,
    max_samples: int = 200,
    *,
    image_dir: str = "train2017",
) -> float:
    """Compute the CIDEr score of the model on a validation subset.

    The first ``min(max_samples, len(val_dataset))`` entries are captioned
    with ``generate_caption`` and scored against their reference captions.
    The score is printed and returned.

    Expects a PyTorch ``Subset``-like object where:
    - ``val_dataset.indices[idx]`` gives the underlying index
    - ``val_dataset.dataset.annotations[...]`` is a list of dicts with
      keys ``image`` (file name relative to ``image_dir``) and ``captions``.

    Args:
        model: Captioning model; switched to eval mode for the evaluation.
        processor: Processor passed through to ``generate_caption``.
        val_dataset: Validation subset as described above.
        device: Device passed through to ``generate_caption``.
        max_samples: Upper bound on the number of samples evaluated.
        image_dir: Directory that the annotation ``image`` names are joined to.

    Returns:
        The corpus-level CIDEr score reported by the scorer.

    Raises:
        ValueError: If there is nothing to evaluate (empty dataset or
            ``max_samples <= 0``).
    """
    num_samples = min(max_samples, len(val_dataset))
    if num_samples <= 0:
        # Fail early with a clear message, before touching the model's state,
        # instead of handing the scorer an empty set of captions.
        raise ValueError(
            "evaluate_cider needs at least one sample "
            "(got an empty dataset or max_samples <= 0)"
        )

    # Remember the current mode so evaluation does not silently flip a model
    # that was already in eval mode into training mode. Objects without a
    # `training` attribute keep the old behaviour (train() is called at the end).
    was_training = getattr(model, "training", True)
    model.eval()

    ground_truth = {}
    predictions = {}

    try:
        for idx in tqdm(range(num_samples), desc="CIDEr Eval"):
            real_idx = val_dataset.indices[idx]
            ann = val_dataset.dataset.annotations[real_idx]

            image_path = os.path.join(image_dir, ann["image"])
            # convert() returns a new, fully loaded image, so the file handle
            # can be released as soon as the conversion is done.
            with Image.open(image_path) as raw_image:
                image = raw_image.convert("RGB")

            pred_caption = generate_caption(model, processor, image, device)

            ground_truth[idx] = ann["captions"]
            predictions[idx] = [pred_caption]
    finally:
        # Restore the original mode even if loading or generation failed.
        if was_training:
            model.train()

    score, _ = Cider().compute_score(ground_truth, predictions)

    print(f"CIDEr Score: {score:.4f}")

    return score