"""Corpus ROUGE-L for caption evaluation.

ROUGE-L measures the longest common subsequence between a prediction and its
references and is part of the standard COCO captioning report (BLEU, METEOR,
ROUGE-L, CIDEr).

Implementation notes:

* We use Google's ``rouge_score`` package (the canonical implementation since
  the original perl scripts were retired). It returns precision / recall /
  fmeasure per (prediction, reference) pair.
* COCO captions ship up to 5 references per image. We take the maximum
  F-measure across references -- same convention as pycocoevalcap.
* The corpus score is the mean of per-sample F-measures, matching how
  sacrebleu and pycocoevalcap aggregate metrics over a dataset.
"""

from __future__ import annotations

from collections.abc import Sequence

from captioning.evaluation.tokenization import (
    strip_sentinels_many,
    strip_sentinels_references,
)


def corpus_rouge_l_score(
    predictions: Sequence[str],
    references: Sequence[Sequence[str]],
) -> float:
    """Return the corpus-level ROUGE-L F-measure on a 0-100 scale.

    Each example is scored against every one of its non-empty references and
    keeps the best F-measure; the corpus score is the mean of those per-example
    values multiplied by 100 (sacrebleu's convention, so BLEU / ROUGE / METEOR /
    CIDEr sit on comparable scales in the report).

    Both inputs are first passed through the project's sentinel-stripping
    helpers. An example whose processed prediction is empty, or whose processed
    references are all empty, adds nothing to the running sum but is still
    counted in the denominator -- i.e. it scores 0.

    Args:
        predictions: One generated caption per example.
        references: One *list* of reference captions per example.

    Returns:
        Mean best-reference ROUGE-L F-measure in the 0-100 range. ``0.0`` when
        ``predictions`` is empty; that case returns before ``rouge_score`` is
        imported, so it works without the package installed.

    Raises:
        ImportError: If ``rouge_score`` is not installed
            (``pip install -r requirements-eval.txt``).
        ValueError: If ``predictions`` and ``references`` differ in length.
            NOTE(review): this function itself does not raise on an empty
            references slot (such examples score 0); any such error would have
            to come from ``strip_sentinels_references`` -- confirm against
            that helper.
    """
    if len(predictions) != len(references):
        raise ValueError(
            f"predictions ({len(predictions)}) and references "
            f"({len(references)}) must have the same length"
        )
    if not predictions:
        return 0.0

    # Imported lazily so the module can be imported without the optional
    # evaluation dependency installed.
    try:
        from rouge_score import rouge_scorer
    except ImportError as exc:
        raise ImportError(
            "rouge_score is required for ROUGE-L evaluation. "
            "Install via `pip install -r requirements-eval.txt`."
        ) from exc

    rouge = rouge_scorer.RougeScorer(["rougeL"], use_stemmer=True)

    cleaned_predictions = strip_sentinels_many(predictions)
    cleaned_references = strip_sentinels_references(references)

    f_measure_sum = 0.0
    for caption, candidates in zip(
        cleaned_predictions, cleaned_references, strict=True
    ):
        usable = [candidate for candidate in candidates if candidate]
        if usable and caption:
            # RougeScorer.score takes (target, prediction); keep the best
            # F-measure over this example's references.
            f_measure_sum += max(
                rouge.score(candidate, caption)["rougeL"].fmeasure
                for candidate in usable
            )
        # Otherwise there is nothing to compare: the example contributes 0
        # but still counts in the denominator below.

    scaled_sum = 100.0 * f_measure_sum
    return float(scaled_sum / len(cleaned_predictions))