File size: 3,820 Bytes

f6ffda2

#!/usr/bin/env python

#

# File Name : rouge.py

#

# Description : Computes ROUGE-L metric as described by Lin and Hovy (2004)

#

# Creation Date : 2015-01-07 06:03

# Author : Ramakrishna Vedantam <vrama91@vt.edu>


import numpy as np

import pdb


def my_lcs(string, sub):
    """
    Return the length of the longest common subsequence (LCS) of two token lists.

    :param string : list of str : tokens from a string split using whitespace
    :param sub : list of str : tokens of the other string; the two arguments are
                 swapped internally when this one is the longer of the pair
    :returns: int : length of the longest common subsequence between the two inputs

    Note: only the length of the LCS is computed, not the subsequence itself.
    """
    if len(string) < len(sub):
        longer, shorter = sub, string
    else:
        longer, shorter = string, sub

    # previous_row[j] holds the LCS length between the part of `longer`
    # consumed so far and the first j tokens of `shorter`.
    previous_row = [0] * (len(shorter) + 1)
    for long_token in longer:
        current_row = [0]
        for j, short_token in enumerate(shorter, 1):
            if long_token == short_token:
                # Matching tokens extend the LCS of both shorter prefixes.
                current_row.append(previous_row[j - 1] + 1)
            else:
                # Otherwise carry over the better of dropping one token from either side.
                current_row.append(max(previous_row[j], current_row[j - 1]))
        previous_row = current_row

    return previous_row[len(shorter)]


class Rouge():
    """
    Class for computing ROUGE-L score for a set of candidate sentences for the MS COCO test set
    """

    def __init__(self):
        # vrama91: updated the value below based on discussion with Hovey
        # beta enters the F-measure in calc_score; a value above 1 weights
        # recall more heavily than precision.
        self.beta = 1.2

    def calc_score(self, candidate, refs):
        """
        Compute ROUGE-L score given one candidate and references for an image

        :param candidate: list of str : single-element list holding the candidate sentence
        :param refs: list of str : non-empty list of reference sentences for the same image
        :returns score: float (ROUGE-L F-measure of the candidate against the references;
                        0.0 when the candidate shares no token with any reference)
        """
        assert (len(candidate) == 1)
        assert (len(refs) > 0)

        # split into tokens on single spaces (same tokenisation for both sides)
        token_c = candidate[0].split(" ")

        prec = []
        rec = []
        for reference in refs:
            token_r = reference.split(" ")

            # length of the longest common subsequence of reference and candidate
            lcs = my_lcs(token_r, token_c)
            prec.append(lcs / float(len(token_c)))
            rec.append(lcs / float(len(token_r)))

        # Best precision and best recall are taken independently over the
        # references; they may come from different references.
        prec_max = max(prec)
        rec_max = max(rec)

        if prec_max == 0 or rec_max == 0:
            return 0.0

        beta_sq = self.beta ** 2
        return ((1 + beta_sq) * prec_max * rec_max) / float(rec_max + beta_sq * prec_max)

    def compute_score(self, references, hypotheses):
        """
        Computes Rouge-L score given a set of reference and candidate sentences for the dataset

        :param references: indexable by 0..len(hypotheses)-1; entry i is a non-empty
                           list of reference sentences (str) for item i
        :param hypotheses: indexable by 0..len(hypotheses)-1; entry i is a
                           single-element list holding the candidate sentence (str) for item i
        :returns: (average_score, scores): mean ROUGE-L score over all items, and a
                  numpy array with the individual score of every item
        """
        score = []

        for i in range(len(hypotheses)):
            hypo = hypotheses[i]
            ref = references[i]

            # Sanity check. Done before scoring so that malformed input fails
            # here rather than somewhere inside calc_score.
            assert isinstance(hypo, list)
            assert (len(hypo) == 1)
            assert isinstance(ref, list)
            assert (len(ref) > 0)

            score.append(self.calc_score(hypo, ref))

        scores = np.array(score)
        average_score = np.mean(scores)

        return average_score, scores

    def method(self):
        """Return the name of this metric."""
        return "Rouge"