File size: 3,820 Bytes
f6ffda2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
#!/usr/bin/env python

#

# File Name : rouge.py

#

# Description : Computes ROUGE-L metric as described by Lin and Hovy (2004)

#

# Creation Date : 2015-01-07 06:03

# Author : Ramakrishna Vedantam <vrama91@vt.edu>


import numpy as np

import pdb


def my_lcs(string, sub):
    """
    Return the length of the longest common subsequence of two token sequences.

    :param string : list of str : tokens from a string split using whitespace
    :param sub : list of str : tokens of the other string; the two arguments
                 are exchanged internally when ``string`` is the shorter one
    :returns: int : length of the longest common subsequence

    Note: only the length is computed, the subsequence itself is not recovered.
    """
    # Keep the longer sequence on the row axis of the table.
    longer, shorter = (sub, string) if len(string) < len(sub) else (string, sub)

    n_rows = len(longer) + 1
    n_cols = len(shorter) + 1

    # table[r][c] holds the LCS length of longer[:r] and shorter[:c];
    # row 0 and column 0 stay at zero (empty prefix).
    table = [[0] * n_cols for _ in range(n_rows)]

    for r in range(1, n_rows):
        token = longer[r - 1]
        for c in range(1, n_cols):
            if token == shorter[c - 1]:
                # Matching tokens extend the LCS of both shorter prefixes.
                table[r][c] = table[r - 1][c - 1] + 1
            else:
                # Otherwise carry over the best result that drops one token.
                table[r][c] = max(table[r - 1][c], table[r][c - 1])

    return table[-1][-1]


class Rouge():
    """
    Class for computing ROUGE-L score for a set of candidate sentences for the MS COCO test set
    """

    def __init__(self):
        # vrama91: updated the value below based on discussion with Hovy
        # beta is the weight used in the F-measure of calc_score that combines
        # the best precision and the best recall.
        self.beta = 1.2

    def calc_score(self, candidate, refs):
        """
        Compute ROUGE-L score given one candidate and its references.

        Sentences are tokenized by splitting on a single space character.

        :param candidate: list of str : list holding exactly one candidate sentence
        :param refs: list of str : one or more reference sentences for the same item
        :returns score: float : F-measure built from the maximum LCS precision and
                        the maximum LCS recall over all references; 0.0 when
                        either of them is zero
        :raises AssertionError: if ``candidate`` does not hold exactly one element
                                or ``refs`` is empty
        """
        assert (len(candidate) == 1)
        assert (len(refs) > 0)

        # The candidate is the same for every reference: tokenize it once.
        token_c = candidate[0].split(" ")
        cand_len = float(len(token_c))

        prec = []
        rec = []
        for reference in refs:
            token_r = reference.split(" ")

            # compute the longest common subsequence
            lcs = my_lcs(token_r, token_c)

            prec.append(lcs / cand_len)
            rec.append(lcs / float(len(token_r)))

        prec_max = max(prec)
        rec_max = max(rec)

        if prec_max != 0 and rec_max != 0:
            beta_sq = self.beta ** 2
            score = ((1 + beta_sq) * prec_max * rec_max) / float(rec_max + beta_sq * prec_max)
        else:
            # Nothing in common with any reference: avoid a 0/0 division.
            score = 0.0

        return score

    def compute_score(self, references, hypotheses):
        """
        Compute ROUGE-L for every hypothesis and the mean over all of them.

        ``hypotheses`` and ``references`` are read in parallel through the
        integer positions ``0 .. len(hypotheses) - 1``.

        :param references: indexable : entry ``i`` is a non-empty list of str with
                           the reference sentences for item ``i``
        :param hypotheses: indexable : entry ``i`` is a list holding exactly one
                           candidate sentence (str) for item ``i``
        :returns: (average_score, scores) : mean of the per-item scores and a
                  numpy array with the individual scores
        :raises AssertionError: if an entry does not have the shape described above
        """
        score = []

        # Index-based access is kept on purpose so that any container
        # addressable by 0..n-1 is accepted.
        for i in range(len(hypotheses)):
            hypo = hypotheses[i]
            ref = references[i]

            # Sanity check. Done before scoring so malformed input is rejected
            # before it is used.
            assert isinstance(hypo, list)
            assert (len(hypo) == 1)
            assert isinstance(ref, list)
            assert (len(ref) > 0)

            score.append(self.calc_score(hypo, ref))

        score_array = np.array(score)
        average_score = np.mean(score_array)

        return average_score, score_array

    def method(self):
        """Return the display name of this metric."""
        return "Rouge"