File size: 2,632 Bytes
91a1214
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
"""Corpus ROUGE-L for caption evaluation.

ROUGE-L measures the longest common subsequence between a prediction and its
references and is part of the standard COCO captioning report (BLEU, METEOR,
ROUGE-L, CIDEr).

Implementation notes:
    * We use Google's ``rouge_score`` package (the canonical implementation
      since the original perl scripts were retired). It returns precision /
      recall / fmeasure per (prediction, reference) pair.
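    * COCO captions ship up to 5 references per image. We take the maximum
      F-measure (F1) across references. This is close to, but not identical
      to, pycocoevalcap, which combines the best precision and the best
      recall across references into an F-beta score (beta = 1.2).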
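    * The corpus score is the mean of per-sample F-measures, matching how
      pycocoevalcap averages ROUGE-L over a dataset (sacrebleu's corpus BLEU,
      by contrast, pools n-gram statistics instead of averaging sentence
      scores). Examples with an empty prediction or no non-empty reference
      contribute 0 to the mean.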
"""

from __future__ import annotations

from collections.abc import Sequence

from captioning.evaluation.tokenization import (
    strip_sentinels_many,
    strip_sentinels_references,
)


def corpus_rouge_l_score(
    predictions: Sequence[str],
    references: Sequence[Sequence[str]],
) -> float:
    """Return the corpus-level ROUGE-L F-measure on a 0-100 scale.

    Each prediction is scored against every non-empty reference of its
    example with ``rouge_score``'s ``rougeL`` (stemming enabled); the best
    F-measure per example is kept, and the per-example values are averaged
    over *all* examples.

    Both inputs are first passed through ``strip_sentinels_many`` /
    ``strip_sentinels_references``. An example whose cleaned prediction is
    empty, or which has no non-empty cleaned reference, contributes 0 to the
    average (it still counts in the denominator).

    Args:
        predictions: One generated caption per example.
        references: One *list* of reference captions per example.

    Returns:
        Mean of the per-example best ROUGE-L F-measures multiplied by 100,
        or ``0.0`` when ``predictions`` is empty.

    Raises:
        ImportError: If ``rouge_score`` is not installed
            (``pip install -r requirements-eval.txt``).
        ValueError: If ``predictions`` and ``references`` differ in length.
            NOTE(review): an empty references slot is not rejected here; it
            is scored as 0. ``strip_sentinels_references`` may raise for it;
            confirm against that helper.
    """
    n_predictions, n_references = len(predictions), len(references)
    if n_predictions != n_references:
        raise ValueError(
            f"predictions ({n_predictions}) and references "
            f"({n_references}) must have the same length"
        )
    if not predictions:
        return 0.0

    # Imported lazily so the module can be loaded without the optional
    # evaluation dependency installed.
    try:
        from rouge_score import rouge_scorer
    except ImportError as exc:
        raise ImportError(
            "rouge_score is required for ROUGE-L evaluation. "
            "Install via `pip install -r requirements-eval.txt`."
        ) from exc

    scorer = rouge_scorer.RougeScorer(["rougeL"], use_stemmer=True)
    cleaned_predictions = strip_sentinels_many(predictions)
    cleaned_references = strip_sentinels_references(references)

    def best_fmeasure(candidate: str, ref_group: Sequence[str]) -> float:
        """Best rougeL F-measure of *candidate* over its non-empty references."""
        usable = [ref for ref in ref_group if ref]
        if not (usable and candidate):
            # Nothing to compare: the example scores zero.
            return 0.0
        # RougeScorer.score takes (target, prediction) in that order.
        return max(
            scorer.score(ref, candidate)["rougeL"].fmeasure for ref in usable
        )

    fmeasure_sum = sum(
        (
            best_fmeasure(candidate, ref_group)
            for candidate, ref_group in zip(
                cleaned_predictions, cleaned_references, strict=True
            )
        ),
        0.0,
    )
    return float(100.0 * fmeasure_sum / len(cleaned_predictions))