#!/usr/bin/env python
#
# File Name : rouge.py
#
# Description : Computes ROUGE-L metric as described by Lin and Hovy (2004)
#
# Creation Date : 2015-01-07 06:03
# Author : Ramakrishna Vedantam <vrama91@vt.edu>
import numpy as np
import pdb
def my_lcs(string, sub):
    """
    Return the length of the longest common subsequence of two token lists
    :param string : list of str : tokens of one sentence, split on whitespace
    :param sub : list of str : tokens of the other sentence, split on whitespace
    :returns: int : number of tokens in the longest common subsequence
    Note: the two arguments are swapped internally when `string` is the shorter one,
    so the longer list always indexes the table rows. Only the length is computed,
    not the subsequence itself.
    """
    if len(string) < len(sub):
        longer, shorter = sub, string
    else:
        longer, shorter = string, sub

    # table[r][c] holds the LCS length of longer[:r] and shorter[:c];
    # row 0 and column 0 stay at zero (empty prefix).
    n_cols = len(shorter) + 1
    table = [[0] * n_cols for _ in range(len(longer) + 1)]

    for row, tok_long in enumerate(longer, start=1):
        above = table[row - 1]
        current = table[row]
        for col, tok_short in enumerate(shorter, start=1):
            if tok_long == tok_short:
                current[col] = above[col - 1] + 1
            else:
                current[col] = max(above[col], current[col - 1])

    return table[-1][-1]
class Rouge():
    '''
    Class for computing ROUGE-L score for a set of candidate sentences for the MS COCO test set
    '''

    def __init__(self):
        # vrama91: updated the value below based on discussion with Hovy
        # beta weights precision against recall in the F-measure of calc_score.
        self.beta = 1.2

    def calc_score(self, candidate, refs):
        """
        Compute ROUGE-L score given one candidate and references for an image
        :param candidate: list of str : exactly one candidate sentence to be evaluated
        :param refs: list of str : non-empty list of reference sentences for the same item
        :returns score: float : F-measure built from the best precision and the best
                        recall over all references; 0.0 when either of them is zero
        :raises AssertionError: if `candidate` does not hold exactly one sentence or
                                `refs` is empty
        """
        assert (len(candidate) == 1)
        assert (len(refs) > 0)

        # Sentences are tokenized on single spaces only.
        token_c = candidate[0].split(" ")
        cand_len = float(len(token_c))

        prec = []
        rec = []
        for reference in refs:
            token_r = reference.split(" ")
            # Length of the longest common subsequence between reference and candidate.
            lcs = my_lcs(token_r, token_c)
            prec.append(lcs / cand_len)
            rec.append(lcs / float(len(token_r)))

        # Best precision and best recall are taken independently, so they may
        # come from different references.
        prec_max = max(prec)
        rec_max = max(rec)

        if prec_max != 0 and rec_max != 0:
            beta_sq = self.beta ** 2
            score = ((1 + beta_sq) * prec_max * rec_max) / float(rec_max + beta_sq * prec_max)
        else:
            score = 0.0
        return score

    def compute_score(self, references, hypotheses):
        """
        Computes Rouge-L score given a set of reference and candidate sentences for the dataset
        :param references: indexable by 0..len(hypotheses)-1 : each entry is a non-empty
                           list of reference sentences (str)
        :param hypotheses: indexable by 0..len(hypotheses)-1 : each entry is a list holding
                           exactly one candidate sentence (str)
        :returns: (average_score, scores) : mean ROUGE-L score over all entries, and a
                  numpy array with the individual score of every entry
        :raises AssertionError: if an entry does not have the shape described above
        """
        score = []
        # Index lookup (rather than zip) is kept on purpose: it works for lists as
        # well as for any container supporting len() and integer-key lookup.
        for i in range(len(hypotheses)):
            hypo = hypotheses[i]
            ref = references[i]

            # Sanity check before scoring, so malformed entries are rejected
            # before they are used.
            assert isinstance(hypo, list)
            assert (len(hypo) == 1)
            assert isinstance(ref, list)
            assert (len(ref) > 0)

            score.append(self.calc_score(hypo, ref))

        scores = np.array(score)
        average_score = np.mean(scores)
        return average_score, scores

    def method(self):
        """Return the display name of this metric."""
        return "Rouge"