#!/usr/bin/env python
#
# File Name : rouge.py
#
# Description : Computes ROUGE-L metric as described by Lin and Hovy (2004)
#
# Creation Date : 2015-01-07 06:03
# Author : Ramakrishna Vedantam

import numpy as np
import pdb


def my_lcs(string, sub):
    """
    Calculate the length of the longest common subsequence (LCS) of two
    tokenized strings.

    The arguments are interchangeable: if ``sub`` is longer than ``string``
    the two are swapped internally before the table is filled.

    :param string : list of str : tokens from a string split using whitespace
    :param sub : list of str : shorter string, also split using whitespace
    :returns: length (int): length of the longest common subsequence between
        the two token lists
    Note: my_lcs only gives the length of the longest common subsequence,
    not the actual LCS
    """
    if len(string) < len(sub):
        sub, string = string, sub

    # lengths[i][j] holds the LCS length of string[:i] and sub[:j]; row 0 and
    # column 0 stay at zero (LCS with an empty prefix).
    lengths = [[0] * (len(sub) + 1) for _ in range(len(string) + 1)]

    for j in range(1, len(sub) + 1):
        for i in range(1, len(string) + 1):
            if string[i - 1] == sub[j - 1]:
                lengths[i][j] = lengths[i - 1][j - 1] + 1
            else:
                lengths[i][j] = max(lengths[i - 1][j], lengths[i][j - 1])

    return lengths[len(string)][len(sub)]


class Rouge:
    """
    Class for computing ROUGE-L score for a set of candidate sentences for
    the MS COCO test set
    """

    def __init__(self):
        # vrama91: updated the value below based on discussion with Hovy
        self.beta = 1.2

    def calc_score(self, candidate, refs):
        """
        Compute ROUGE-L score given one candidate and references for an image

        :param candidate: list of str : single-element list holding the
            candidate sentence to be evaluated
        :param refs: list of str : reference sentences for the particular
            image to be evaluated
        :returns score: float (ROUGE-L score for the candidate evaluated
            against references; 0.0 when there is no token overlap)
        :raises AssertionError: if ``candidate`` does not have exactly one
            element or ``refs`` is empty
        """
        assert len(candidate) == 1
        assert len(refs) > 0

        prec = []
        rec = []

        # Split on single spaces (not arbitrary whitespace); str.split(" ")
        # never returns an empty list, so the divisions below are safe.
        token_c = candidate[0].split(" ")

        for reference in refs:
            token_r = reference.split(" ")
            # compute the longest common subsequence
            lcs = my_lcs(token_r, token_c)
            prec.append(lcs / float(len(token_c)))
            rec.append(lcs / float(len(token_r)))

        # Best precision and best recall are taken independently, so they may
        # come from different references.
        prec_max = max(prec)
        rec_max = max(rec)

        if prec_max != 0 and rec_max != 0:
            beta_sq = self.beta ** 2
            score = ((1 + beta_sq) * prec_max * rec_max) / float(
                rec_max + beta_sq * prec_max
            )
        else:
            score = 0.0
        return score

    def compute_score(self, references, hypotheses):
        """
        Computes Rouge-L score given a set of reference and candidate
        sentences for the dataset

        :param references: indexable by 0..len(hypotheses)-1 : each entry is
            a non-empty list of reference sentences (str)
        :param hypotheses: indexable by 0..len(hypotheses)-1 : each entry is
            a single-element list holding the candidate sentence (str)
        :returns: (average_score, scores) : mean ROUGE-L score over all
            entries, and a numpy array with the per-entry scores
        :raises AssertionError: if an entry is not a list of the expected
            length
        """
        score = []
        # Index-based access is kept on purpose: it also works for mappings
        # keyed by consecutive integers, which zip() would not.
        for i in range(len(hypotheses)):
            hypo = hypotheses[i]
            ref = references[i]

            # Sanity check before scoring, so malformed input is rejected
            # before it is used.
            assert isinstance(hypo, list)
            assert len(hypo) == 1
            assert isinstance(ref, list)
            assert len(ref) > 0

            score.append(self.calc_score(hypo, ref))

        scores = np.array(score)
        average_score = np.mean(scores)
        return average_score, scores

    def method(self):
        """Return the name of this metric."""
        return "Rouge"