File size: 2,724 Bytes
91a1214
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
"""METEOR (Metric for Evaluation of Translation with Explicit Ordering).

METEOR is part of the standard COCO captioning report alongside BLEU, ROUGE-L,
and CIDEr. It complements BLEU by rewarding semantic matches (synonyms,
stems) rather than only surface n-gram overlap.

Implementation notes:
    * We use the ``pycocoevalcap`` METEOR adapter, which shells out to the
      original Java implementation. METEOR therefore needs a JRE on PATH at
      runtime; importing this module succeeds either way, because the Java
      process is only spawned when a scorer is constructed inside
      ``corpus_meteor_score``, never at import time.
    * METEOR's process is long-lived and accepts batches over stdin/stdout —
      a single ``compute_score`` call handles the whole corpus in one round
      trip, so this scales to thousands of examples without thrashing the JVM.
"""

from __future__ import annotations

from collections.abc import Sequence

from captioning.evaluation.tokenization import (
    strip_sentinels_many,
    strip_sentinels_references,
)


def corpus_meteor_score(
    predictions: Sequence[str],
    references: Sequence[Sequence[str]],
) -> float:
    """Compute corpus METEOR via ``pycocoevalcap``.

    Args:
        predictions: One generated caption per example.
        references: One *list* of reference captions per example.

    Returns:
        Corpus METEOR in the 0-100 range to match the rest of this package.
        pycocoevalcap returns 0-1; we multiply by 100 for report parity.
        Empty input short-circuits to ``0.0`` without importing the scorer
        or starting Java.

    Raises:
        ImportError: If ``pycocoevalcap`` is not installed.
        ValueError: On mismatched lengths.
        RuntimeError: If the Java METEOR process cannot be launched or the
            scoring round trip fails.
    """
    if len(predictions) != len(references):
        raise ValueError(
            f"predictions ({len(predictions)}) and references "
            f"({len(references)}) must have the same length"
        )
    if not predictions:
        return 0.0

    # Imported lazily so the rest of the package works without the optional
    # evaluation dependency installed.
    try:
        from pycocoevalcap.meteor.meteor import Meteor
    except ImportError as e:
        raise ImportError(
            "pycocoevalcap is required for METEOR evaluation. "
            "Install via `pip install -r requirements-eval.txt`."
        ) from e

    # NOTE(review): the strip_sentinels_* helpers live in
    # captioning.evaluation.tokenization; presumably they remove special
    # tokens from each caption -- confirm against that module.
    preds = strip_sentinels_many(predictions)
    refs = strip_sentinels_references(references)

    # pycocoevalcap expects {id: [captions]} dicts keyed identically on both
    # sides; references that are empty after stripping are dropped.
    gts = {str(i): [r for r in ref_list if r] for i, ref_list in enumerate(refs)}
    res = {str(i): [p] for i, p in enumerate(preds)}

    scorer = None
    try:
        # The adapter launches the Java subprocess in its constructor, so the
        # construction must sit inside the ``try`` for a missing JRE to surface
        # as the documented RuntimeError instead of a raw FileNotFoundError.
        scorer = Meteor()
        score, _ = scorer.compute_score(gts, res)
    except Exception as e:  # meteor.py raises bare Exceptions
        raise RuntimeError(
            "METEOR scoring failed. METEOR requires a Java runtime on PATH. "
            f"Underlying error: {e}"
        ) from e
    finally:
        # Shut the JVM down now rather than leaving one orphaned subprocess
        # per call until interpreter exit. Older adapter versions have no
        # ``close`` method, hence the getattr guard.
        close = getattr(scorer, "close", None)
        if callable(close):
            close()

    return float(100.0 * score)