RogerFerrod's picture
upload code
f6ffda2 verified
#!/usr/bin/env python
#
# File Name : rouge.py
#
# Description : Computes ROUGE-L metric as described by Lin and Hovey (2004)
#
# Creation Date : 2015-01-07 06:03
# Author : Ramakrishna Vedantam <vrama91@vt.edu>
import numpy as np
import pdb
def my_lcs(string, sub):
"""
Calculates longest common subsequence for a pair of tokenized strings
:param string : list of str : tokens from a string split using whitespace
:param sub : list of str : shorter string, also split using whitespace
:returns: length (list of int): length of the longest common subsequence between the two strings
Note: my_lcs only gives length of the longest common subsequence, not the actual LCS
"""
if (len(string) < len(sub)):
sub, string = string, sub
lengths = [[0 for i in range(0, len(sub) + 1)] for j in range(0, len(string) + 1)]
for j in range(1, len(sub) + 1):
for i in range(1, len(string) + 1):
if (string[i - 1] == sub[j - 1]):
lengths[i][j] = lengths[i - 1][j - 1] + 1
else:
lengths[i][j] = max(lengths[i - 1][j], lengths[i][j - 1])
return lengths[len(string)][len(sub)]
class Rouge():
'''
Class for computing ROUGE-L score for a set of candidate sentences for the MS COCO test set
'''
def __init__(self):
# vrama91: updated the value below based on discussion with Hovey
self.beta = 1.2
def calc_score(self, candidate, refs):
"""
Compute ROUGE-L score given one candidate and references for an image
:param candidate: str : candidate sentence to be evaluated
:param refs: list of str : COCO reference sentences for the particular image to be evaluated
:returns score: int (ROUGE-L score for the candidate evaluated against references)
"""
assert (len(candidate) == 1)
assert (len(refs) > 0)
prec = []
rec = []
# split into tokens
token_c = candidate[0].split(" ")
for reference in refs:
# split into tokens
hh =1
token_r = reference.split(" ")
# compute the longest common subsequence
lcs = my_lcs(token_r, token_c)
prec.append(lcs / float(len(token_c)))
rec.append(lcs / float(len(token_r)))
prec_max = max(prec)
rec_max = max(rec)
if (prec_max != 0 and rec_max != 0):
score = ((1 + self.beta ** 2) * prec_max * rec_max) / float(rec_max + self.beta ** 2 * prec_max)
else:
score = 0.0
return score
def compute_score(self, references, hypotheses):
"""
Computes Rouge-L score given a set of reference and candidate sentences for the dataset
Invoked by evaluate_captions.py
:param hypo_for_image: dict : candidate / test sentences with "image name" key and "tokenized sentences" as values
:param ref_for_image: dict : reference MS-COCO sentences with "image name" key and "tokenized sentences" as values
:returns: average_score: float (mean ROUGE-L score computed by averaging scores for all the images)
"""
# assert (gts.keys() == res.keys())
#
# imgIds = gts.keys()
score = []
for i in range(len(hypotheses)):
hypo = hypotheses[i]
ref = references[i]
score.append(self.calc_score(hypo, ref))
# Sanity check.
assert (type(hypo) is list)
assert (len(hypo) == 1)
assert (type(ref) is list)
assert (len(ref) > 0)
average_score = np.mean(np.array(score))
return average_score, np.array(score)
def method(self):
return "Rouge"