"""This module implements the CIDEr metric for image captioning evaluation."""

import evaluate
import datasets

from .cider_scorer import CiderScorer

_CITATION = """\
@InProceedings{Vedantam_2015_CVPR,
    author = {Vedantam, Ramakrishna and Lawrence Zitnick, C. and Parikh, Devi},
    title = {CIDEr: Consensus-Based Image Description Evaluation},
    booktitle = {Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR)},
    month = {June},
    year = {2015}
}
"""

_DESCRIPTION = """\
This is a metric to evaluate image captioning. It is based on the idea of
measuring the consensus between a candidate image caption and a set of
reference image captions written by humans. Each caption is represented as a
bag of TF-IDF weighted n-grams, and the CIDEr score is the cosine similarity
between the candidate's vector and each reference's vector, averaged over the
references and over n-gram lengths from 1 to 4. The TF-IDF weighting rewards
n-grams that are informative about the image and down-weights n-grams that
are common across the whole corpus.
"""


_KWARGS_DESCRIPTION = """
CIDEr (Consensus-based Image Description Evaluation) is a metric for evaluating the quality of image captions.
It measures how similar a generated caption is to a set of reference captions written by humans.
Args:
    predictions: list of candidate captions to score, one string per example.
    references: list of references for each prediction; each entry is either a
        single reference string or a list of reference strings.
Returns:
    cider_score: CIDEr score.
Examples:
    >>> metric = evaluate.load("sunhill/cider")
    >>> results = metric.compute(
    ...     predictions=['train traveling down a track in front of a road'],
    ...     references=[
    ...         [
    ...             'a train traveling down tracks next to lights',
    ...             'a blue and silver train next to train station and trees',
    ...             'a blue train is next to a sidewalk on the rails',
    ...             'a passenger train pulls into a train station',
    ...             'a train coming down the tracks arriving at a station'
    ...         ]
    ...     ]
    ... )
"""


@evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
class CIDEr(evaluate.Metric):
    """CIDEr metric."""

    def _info(self):
        return evaluate.MetricInfo(
            # This is the description that will appear on the modules page.
            module_type="metric",
            description=_DESCRIPTION,
            citation=_CITATION,
            inputs_description=_KWARGS_DESCRIPTION,
            # This defines the format of each prediction and reference
            features=[
                datasets.Features(
                    {
                        "predictions": datasets.Value("string"),
                        "references": datasets.Value("string"),
                    }
                ),
                datasets.Features(
                    {
                        "predictions": datasets.Value("string"),
                        "references": datasets.Sequence(datasets.Value("string")),
                    }
                ),
            ],
            # Homepage of the module for documentation
            homepage="https://huggingface.co/spaces/sunhill/cider",
            # Additional links to the codebase or references
            codebase_urls=[
                "https://github.com/ramavedantam/cider",
                "https://github.com/EricWWWW/image-caption-metrics",
            ],
            reference_urls=[
                (
                    "https://openaccess.thecvf.com/content_cvpr_2015/html/"
                    "Vedantam_CIDEr_Consensus-Based_Image_2015_CVPR_paper.html"
                )
            ],
        )

    def _compute(self, predictions, references):
        """Returns the scores"""
        assert len(predictions) == len(references), (
            "The number of predictions and references should be the same. "
            f"Got {len(predictions)} predictions and {len(references)} references."
        )
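        # n=4 and sigma=6.0 are the defaults from the CIDEr paper: n-grams up
        # to length 4, with sigma controlling the Gaussian length penalty of
        # the CIDEr-D variant.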
        cider_scorer = CiderScorer(n=4, sigma=6.0)
        for pred, ref in zip(predictions, references):
            assert isinstance(pred, str), (
                f"Each prediction should be a string. Got {type(pred)}."
            )
            if isinstance(ref, str):
                ref = [ref]
            assert isinstance(ref, list) and all(isinstance(r, str) for r in ref), (
                "Each reference should be a list of strings. "
                f"Got {type(ref)} with elements of type {[type(r) for r in ref]}."
            )
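            # CiderScorer overloads += to append a (candidate, references)
            # pair to the corpus being scored.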
            cider_scorer += (pred, ref)
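        # compute_score returns the corpus-level mean and the array of
        # per-caption scores; only the mean is reported here.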
        score, _ = cider_scorer.compute_score()
        return {"cider_score": score.item()}