Spaces:

apoorvrajdev
/

image-captioning-api

Configuration error

App Files Files Community

image-captioning-api / src /captioning /evaluation /cider.py

apoorvrajdev

feat(evaluation): add beam search, metrics pipeline, and stabilized training workflow

91a1214 21 days ago

raw

history blame contribute delete

3 kB

	"""CIDEr (Consensus-based Image Description Evaluation) corpus metric.

	CIDEr is the metric the COCO captioning leaderboard ranks by. It computes a
	TF-IDF weighting over n-grams of the references and measures cosine similarity
	to the prediction. Higher is better; correctly trained models score in the
	range 0.6 - 1.4 on COCO val.

	Implementation notes:
	* We delegate to ``pycocoevalcap`` — the reference implementation used by
	the original CIDEr paper and by every COCO submission.
	* CIDEr's TF-IDF is corpus-level: scoring a single example returns 0
	because every n-gram is "common" to that one-document corpus. The
	``runner`` aggregator and the CLI guard against calling CIDEr with
	fewer than ``MIN_SAMPLES_FOR_CIDER`` examples and return ``None``
	in that case.
	"""

	from __future__ import annotations

	from collections.abc import Sequence

	from captioning.evaluation.tokenization import (
	strip_sentinels_many,
	strip_sentinels_references,
	)

	# Minimum number of examples ``corpus_cider_score`` accepts.
	#
	# CIDEr's TF-IDF is degenerate below this — every n-gram is "common"
	# to the entire corpus, so the score collapses to 0. Rather than report a
	# misleading value, ``corpus_cider_score`` raises ``ValueError`` below this
	# threshold; per the module docstring, the ``runner`` aggregator and the CLI
	# check the sample count first and surface ``None`` instead.
	MIN_SAMPLES_FOR_CIDER = 2


	def corpus_cider_score(
	    predictions: Sequence[str],
	    references: Sequence[Sequence[str]],
	) -> float:
	    """Compute corpus CIDEr.

	    Inputs are validated, passed through ``strip_sentinels_many`` /
	    ``strip_sentinels_references``, reshaped into the ``{image_id: [captions]}``
	    dictionaries that ``pycocoevalcap`` expects, and scored with its ``Cider``
	    scorer.

	    Args:
	        predictions: One generated caption per example.
	        references: One list of reference captions per example.

	    Returns:
	        CIDEr in the 0-10 range (pycocoevalcap convention; the typical COCO
	        leaderboard value is in [0, 2]).

	    Raises:
	        ImportError: If ``pycocoevalcap`` is not installed.
	        ValueError: On mismatched lengths, if called with fewer than
	            ``MIN_SAMPLES_FOR_CIDER`` examples (in which case CIDEr's TF-IDF
	            is degenerate and the score is meaningless), or if any example
	            is left without a non-empty reference caption.
	    """
	    if len(predictions) != len(references):
	        raise ValueError(
	            f"predictions ({len(predictions)}) and references "
	            f"({len(references)}) must have the same length"
	        )
	    if len(predictions) < MIN_SAMPLES_FOR_CIDER:
	        raise ValueError(
	            f"CIDEr requires at least {MIN_SAMPLES_FOR_CIDER} examples; "
	            f"got {len(predictions)}. TF-IDF is degenerate on smaller corpora."
	        )

	    # Imported lazily so the dependency is only required when CIDEr is
	    # actually computed.
	    try:
	        from pycocoevalcap.cider.cider import Cider
	    except ImportError as e:
	        raise ImportError(
	            "pycocoevalcap is required for CIDEr evaluation. "
	            "Install via `pip install -r requirements-eval.txt`."
	        ) from e

	    preds = strip_sentinels_many(predictions)
	    refs = strip_sentinels_references(references)

	    # pycocoevalcap expects {image_id: [captions]} dicts.
	    # Empty reference strings are dropped; they carry no n-grams.
	    gts = {str(i): [r for r in ref_list if r] for i, ref_list in enumerate(refs)}
	    res = {str(i): [p] for i, p in enumerate(preds)}

	    # pycocoevalcap only guards against empty reference lists with a bare
	    # ``assert`` (no message, and skipped entirely under ``python -O``), so
	    # fail here with an actionable error instead.
	    unreferenced = [key for key, captions in gts.items() if not captions]
	    if unreferenced:
	        shown = ", ".join(unreferenced[:5])
	        suffix = ", ..." if len(unreferenced) > 5 else ""
	        raise ValueError(
	            "CIDEr requires at least one non-empty reference caption per "
	            f"example; {len(unreferenced)} example(s) have none "
	            f"(indices: {shown}{suffix})"
	        )

	    scorer = Cider()
	    score, _ = scorer.compute_score(gts, res)
	    return float(score)