Continual / root_gainlora /src /rouge /scoring_test.py
natmin322's picture
add root_gainlora to repo for testing
e2bef95
# coding=utf-8
# Copyright 2022 The Google Research Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for rouge scoring and aggregation.

Checks both correctness and consistency with values from the Perl ROUGE
implementation which this package replicates.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
from absl.testing import absltest
import numpy as np
from six.moves import range
from six.moves import zip
from rouge import rouge_scorer
from rouge import scoring
from rouge import test_util
# Delta for matching against ground truth rouge values. Must be relatively
# high compared to the individual rouge tests since bootstrap sampling
# introduces randomness. Used as the default tolerance of
# BootstrapAggregatorTest.assertSimilarAggregates.
_DELTA = 0.002
# Use a fixed random seed, or tests may fail with nonzero probability. Passed
# to np.random.seed in BootstrapAggregatorTest.setUp.
_RANDOM_SEED = 123
class BootstrapAggregatorTest(absltest.TestCase):
    """Tests for scoring.BootstrapAggregator.

    Covers small hand-computed percentile cases as well as comparisons of
    aggregated confidence intervals against reference values for the large
    fixture files exposed by test_util.
    """

    def setUp(self):
        """Seeds numpy's RNG and loads the large target/prediction fixtures."""
        super(BootstrapAggregatorTest, self).setUp()
        # Bootstrap sampling introduces randomness; reseed so that each test
        # starts from the same RNG state.
        np.random.seed(_RANDOM_SEED)
        with open(test_util.LARGE_TARGETS_FILE) as f:
            self.targets = f.readlines()
        with open(test_util.LARGE_PREDICTIONS_FILE) as f:
            self.predictions = f.readlines()

    def assertSimilarAggregates(self, precision, recall, fmeasure, aggregate,
                                delta=_DELTA):
        """Helper method for asserting matching aggregate scores.

        Checks are performed in the order precision, recall, fmeasure, and
        within each metric in the order low, mid, high.

        Args:
          precision: Tuple of (low, mid, high) precision scores.
          recall: Tuple of (low, mid, high) recall scores.
          fmeasure: Tuple of (low, mid, high) fmeasure scores.
          aggregate: An AggregateScore object.
          delta: Tolerance delta for matching values.
        """
        expected_by_metric = (
            ("precision", precision),
            ("recall", recall),
            ("fmeasure", fmeasure),
        )
        for metric, expected in expected_by_metric:
            for index, bound in enumerate(("low", "mid", "high")):
                actual = getattr(getattr(aggregate, bound), metric)
                # Index explicitly (rather than zip) so that a too-short
                # expected tuple still raises instead of skipping checks.
                self.assertAlmostEqual(
                    expected[index], actual, delta=delta,
                    msg="mismatch in %s.%s" % (bound, metric))

    def _make_three_score_aggregator(self, confidence_interval):
        """Returns a BootstrapAggregator holding three fixed rouge1 scores.

        Args:
          confidence_interval: Value forwarded to BootstrapAggregator.

        Returns:
          The aggregator after three add_scores calls with
          (precision, recall, fmeasure) of (1, 1/3, 1/2), (0, 0, 0), (1, 1, 1).
        """
        aggregator = scoring.BootstrapAggregator(
            confidence_interval=confidence_interval)
        fixed_scores = (
            (1, 1 / 3, 1 / 2),
            (0, 0, 0),
            (1, 1, 1),
        )
        for precision, recall, fmeasure in fixed_scores:
            aggregator.add_scores({
                "rouge1": scoring.Score(
                    precision=precision, recall=recall, fmeasure=fmeasure)
            })
        return aggregator

    def testConsistentPercentiles(self):
        """Checks low/mid/high for three fixed scores at a 0.9 interval."""
        aggregator = self._make_three_score_aggregator(0.9)
        result = aggregator.aggregate()

        self.assertSimilarAggregates((1 / 3, 2 / 3, 3 / 3),
                                     (1 / 9, 4 / 9, 7 / 9),
                                     (1 / 6, 3 / 6, 5 / 6),
                                     result["rouge1"], delta=1e-8)

    def testLargeConfidence(self):
        """Checks that a 0.0 interval yields identical low, mid and high."""
        aggregator = self._make_three_score_aggregator(0.0)
        result = aggregator.aggregate()

        # With confidence_interval=0.0 all three bounds are expected to
        # coincide for every metric.
        self.assertSimilarAggregates((2 / 3, 2 / 3, 2 / 3),
                                     (4 / 9, 4 / 9, 4 / 9),
                                     (3 / 6, 3 / 6, 3 / 6),
                                     result["rouge1"], delta=1e-8)

    def testMultipleRougeTypes(self):
        """Checks that the aggregate result has one entry per rouge type."""
        scorer = rouge_scorer.RougeScorer(
            ["rouge1", "rougeL"], use_stemmer=False)
        aggregator = scoring.BootstrapAggregator()
        # Only the result keys are checked, so five examples are enough.
        for target, prediction in zip(self.targets[:5], self.predictions[:5]):
            aggregator.add_scores(scorer.score(target, prediction))
        result = aggregator.aggregate()

        self.assertSameElements(list(result.keys()), ["rouge1", "rougeL"])

    def testConfidenceIntervalsAgainstRouge155(self):
        """Compares rouge1 intervals (no stemming) with reference values."""
        scorer = rouge_scorer.RougeScorer(["rouge1"], use_stemmer=False)
        aggregator = scoring.BootstrapAggregator()
        for target, prediction in zip(self.targets, self.predictions):
            aggregator.add_scores(scorer.score(target, prediction))
        result = aggregator.aggregate()

        self.assertSimilarAggregates((0.48695, 0.49879, 0.51131),  # P
                                     (0.31106, 0.31950, 0.32849),  # R
                                     (0.37614, 0.38554, 0.39581),  # F
                                     result["rouge1"])

    def testConfidenceIntervalsAgainstRouge155WithStemming(self):
        """Compares rouge1/rougeL intervals (stemmed) with reference values."""
        scorer = rouge_scorer.RougeScorer(
            ["rouge1", "rougeL"], use_stemmer=True)
        aggregator = scoring.BootstrapAggregator()
        for target, prediction in zip(self.targets, self.predictions):
            aggregator.add_scores(scorer.score(target, prediction))
        result = aggregator.aggregate()

        self.assertSimilarAggregates((0.51027, 0.52434, 0.53788),  # P
                                     (0.32563, 0.33580, 0.34548),  # R
                                     (0.39380, 0.40524, 0.41661),  # F
                                     result["rouge1"])

        self.assertSimilarAggregates((0.50759, 0.52104, 0.53382),  # P
                                     (0.32418, 0.33377, 0.34362),  # R
                                     (0.39157, 0.40275, 0.41383),  # F
                                     result["rougeL"])

    def testConfidenceIntervalsAgainstRouge155WithStemmingMultiLine(self):
        """Compares rouge1/rouge2/rougeLsum intervals on multi-line files."""
        scorer = rouge_scorer.RougeScorer(
            ["rouge1", "rouge2", "rougeLsum"], use_stemmer=True)
        aggregator = scoring.BootstrapAggregator()

        # Fixture files are numbered 0..249 under test_util.PYROUGE_DIR.
        num_examples = 250
        target_paths = [
            os.path.join(test_util.PYROUGE_DIR, "target_multi.%d.txt" % i)
            for i in range(num_examples)
        ]
        prediction_paths = [
            os.path.join(test_util.PYROUGE_DIR, "prediction_multi.%d.txt" % i)
            for i in range(num_examples)
        ]
        targets = [test_util.get_text(path) for path in target_paths]
        predictions = [test_util.get_text(path) for path in prediction_paths]
        # unittest assertions (unlike bare `assert`) survive `python -O` and
        # report the offending length on failure.
        self.assertLen(targets, num_examples)
        self.assertLen(predictions, num_examples)

        for target, prediction in zip(targets, predictions):
            aggregator.add_scores(scorer.score(target, prediction))
        result = aggregator.aggregate()

        # DIR = testdata/pyrouge_evaluate_plain_text_files
        # pyrouge_evaluate_plain_text_files -s $DIR -sfp "prediction_multi.(.*).txt"
        #    -m $DIR -mfp target_multi.#ID#.txt
        self.assertSimilarAggregates((0.58963, 0.59877, 0.60822),  # P
                                     (0.37327, 0.38091, 0.38914),  # R
                                     (0.45607, 0.46411, 0.47244),  # F
                                     result["rouge1"])
        self.assertSimilarAggregates((0.35429, 0.36516, 0.37665),  # P
                                     (0.22341, 0.23109, 0.23916),  # R
                                     (0.27312, 0.28209, 0.29133),  # F
                                     result["rouge2"])
        self.assertSimilarAggregates((0.58604, 0.59491, 0.60444),  # P
                                     (0.37084, 0.37846, 0.38671),  # R
                                     (0.45305, 0.46113, 0.46946),  # F
                                     result["rougeLsum"])
# Hand control to the absltest runner when this file is executed as a script.
if __name__ == "__main__":
    absltest.main()