# Deploy Streamlit app (commit a745a5e, pchandragrid)
import os
from typing import Any
from PIL import Image
from pycocoevalcap.cider.cider import Cider
from tqdm import tqdm
def generate_caption(model: Any, processor: Any, image: "Image.Image", device) -> str:
    """
    Run the captioning model on a single image and return the decoded caption.

    Args:
        model: HuggingFace-style generative model exposing ``generate(**inputs)``.
        processor: Paired processor exposing ``__call__(images=..., return_tensors="pt")``
            (whose result supports ``.to(device)``) and
            ``decode(ids, skip_special_tokens=...)``.
        image: Input image. Annotation is quoted so the function can be defined
            even when PIL is unavailable at import time.
        device: Torch device the model inputs are moved to.

    Returns:
        The decoded caption string for ``image``.
    """
    # Plain local import replaces the original obfuscated
    # getattr(__import__("torch"), "no_grad")() dance; keeps torch optional
    # for callers that never run inference.
    import torch

    inputs = processor(images=image, return_tensors="pt").to(device)
    # Inference only: disable autograd to save memory and time.
    with torch.no_grad():
        generated_ids = model.generate(
            **inputs,
            max_length=30,
            num_beams=5,
        )
    caption = processor.decode(
        generated_ids[0],
        skip_special_tokens=True,
    )
    return caption
def evaluate_cider(
    model: Any,
    processor: Any,
    val_dataset,
    device,
    max_samples: int = 200,
    image_dir: str = "train2017",
) -> float:
    """
    Compute the CIDEr score of ``model`` on a validation subset.

    Expects a PyTorch ``Subset``-like dataset where:
      - ``val_dataset.indices[idx]`` gives the underlying index
      - ``val_dataset.dataset.annotations[...]`` is a list of dicts with
        keys ``image`` and ``captions``.

    Args:
        model: Captioning model; switched to eval mode while scoring and its
            prior train/eval mode is restored afterwards.
        processor: Processor paired with ``model`` (see ``generate_caption``).
        val_dataset: Subset-style validation dataset (structure above).
        device: Torch device used for inference.
        max_samples: Upper bound on the number of images scored.
        image_dir: Directory containing the image files named in the
            annotations. Defaults to ``"train2017"`` (previous hard-coded path).

    Returns:
        The corpus-level CIDEr score as a float.
    """
    # Remember the caller's mode so we restore it instead of forcing train(),
    # and do so even if an exception interrupts the evaluation loop.
    was_training = getattr(model, "training", True)
    model.eval()
    cider_scorer = Cider()
    ground_truth = {}
    predictions = {}
    try:
        for idx in tqdm(range(min(max_samples, len(val_dataset))), desc="CIDEr Eval"):
            real_idx = val_dataset.indices[idx]
            ann = val_dataset.dataset.annotations[real_idx]
            image_path = os.path.join(image_dir, ann["image"])
            # Context manager ensures the underlying file handle is closed.
            with Image.open(image_path) as img:
                image = img.convert("RGB")
            pred_caption = generate_caption(model, processor, image, device)
            ground_truth[idx] = ann["captions"]
            predictions[idx] = [pred_caption]
        score, _ = cider_scorer.compute_score(ground_truth, predictions)
    finally:
        if was_training:
            model.train()
    print(f"CIDEr Score: {score:.4f}")
    return score