# coding=utf-8 # Copyright 2022 The Google Research Authors. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """Tests for rouge scoring and aggregation. Checks for both correctness, and for consistency with values from the perl ROUGE implementation which this package replicates. """ from __future__ import absolute_import from __future__ import division from __future__ import print_function import os from absl.testing import absltest import numpy as np from six.moves import range from six.moves import zip from rouge import rouge_scorer from rouge import scoring from rouge import test_util # Delta for matching against ground truth rouge values. Must be relatively # high compared to the individual rouge tests since bootstrap sampling # introduces randomness. _DELTA = 0.002 # Use a fixed random seed, or tests may fail with nonzero probability. _RANDOM_SEED = 123 class BootstrapAggregatorTest(absltest.TestCase): def setUp(self): super(BootstrapAggregatorTest, self).setUp() np.random.seed(_RANDOM_SEED) with open(test_util.LARGE_TARGETS_FILE) as f: self.targets = f.readlines() with open(test_util.LARGE_PREDICTIONS_FILE) as f: self.predictions = f.readlines() def assertSimilarAggregates(self, precision, recall, fmeasure, aggregate, delta=_DELTA): """Helper method for asserting matching aggregate scores. Args: precision: Tuple of (low, mid, high) precision scores. recall: Tuple of (low, mid, high) recall scores. fmeasure: Tuple of (low, mid, high) fmeasure scores. aggregate: An AggregateScore object. delta: Tolerance delta for matching values. """ self.assertAlmostEqual(precision[0], aggregate.low.precision, delta=delta) self.assertAlmostEqual(precision[1], aggregate.mid.precision, delta=delta) self.assertAlmostEqual(precision[2], aggregate.high.precision, delta=delta) self.assertAlmostEqual(recall[0], aggregate.low.recall, delta=delta) self.assertAlmostEqual(recall[1], aggregate.mid.recall, delta=delta) self.assertAlmostEqual(recall[2], aggregate.high.recall, delta=delta) self.assertAlmostEqual(fmeasure[0], aggregate.low.fmeasure, delta=delta) self.assertAlmostEqual(fmeasure[1], aggregate.mid.fmeasure, delta=delta) self.assertAlmostEqual(fmeasure[2], aggregate.high.fmeasure, delta=delta) def testConsistentPercentiles(self): aggregator = scoring.BootstrapAggregator(confidence_interval=0.9) aggregator.add_scores({ "rouge1": scoring.Score(precision=1, recall=1 / 3, fmeasure=1 / 2) }) aggregator.add_scores({ "rouge1": scoring.Score(precision=0, recall=0, fmeasure=0) }) aggregator.add_scores({ "rouge1": scoring.Score(precision=1, recall=1, fmeasure=1) }) result = aggregator.aggregate() self.assertSimilarAggregates((1 / 3, 2 / 3, 3 / 3), (1 / 9, 4 / 9, 7 / 9), (1 / 6, 3 / 6, 5 / 6), result["rouge1"], delta=1e-8) def testLargeConfidence(self): aggregator = scoring.BootstrapAggregator(confidence_interval=0.0) aggregator.add_scores({ "rouge1": scoring.Score(precision=1, recall=1 / 3, fmeasure=1 / 2) }) aggregator.add_scores({ "rouge1": scoring.Score(precision=0, recall=0, fmeasure=0) }) aggregator.add_scores({ "rouge1": scoring.Score(precision=1, recall=1, fmeasure=1) }) result = aggregator.aggregate() self.assertSimilarAggregates((2 / 3, 2 / 3, 2 / 3), (4 / 9, 4 / 9, 4 / 9), (3 / 6, 3 / 6, 3 / 6), result["rouge1"], delta=1e-8) def testMultipleRougeTypes(self): scorer = rouge_scorer.RougeScorer(["rouge1", "rougeL"], use_stemmer=False) aggregator = scoring.BootstrapAggregator() for target, prediction in zip(self.targets[:5], self.predictions[:5]): aggregator.add_scores(scorer.score(target, prediction)) result = aggregator.aggregate() self.assertSameElements(list(result.keys()), ["rouge1", "rougeL"]) def testConfidenceIntervalsAgainstRouge155(self): scorer = rouge_scorer.RougeScorer(["rouge1"], use_stemmer=False) aggregator = scoring.BootstrapAggregator() for target, prediction in zip(self.targets, self.predictions): aggregator.add_scores(scorer.score(target, prediction)) result = aggregator.aggregate() self.assertSimilarAggregates((0.48695, 0.49879, 0.51131), (0.31106, 0.31950, 0.32849), (0.37614, 0.38554, 0.39581), result["rouge1"]) def testConfidenceIntervalsAgainstRouge155WithStemming(self): scorer = rouge_scorer.RougeScorer(["rouge1", "rougeL"], use_stemmer=True) aggregator = scoring.BootstrapAggregator() for target, prediction in zip(self.targets, self.predictions): aggregator.add_scores(scorer.score(target, prediction)) result = aggregator.aggregate() self.assertSimilarAggregates((0.51027, 0.52434, 0.53788), (0.32563, 0.33580, 0.34548), (0.39380, 0.40524, 0.41661), result["rouge1"]) self.assertSimilarAggregates((0.50759, 0.52104, 0.53382), # P (0.32418, 0.33377, 0.34362), # R (0.39157, 0.40275, 0.41383), # F result["rougeL"]) def testConfidenceIntervalsAgainstRouge155WithStemmingMultiLine(self): scorer = rouge_scorer.RougeScorer( ["rouge1", "rouge2", "rougeLsum"], use_stemmer=True) aggregator = scoring.BootstrapAggregator() t_files = [os.path.join(test_util.PYROUGE_DIR, 'target_multi.%d.txt' % i) for i in range(0, 250)] p_files = [os.path.join(test_util.PYROUGE_DIR, 'prediction_multi.%d.txt' % i) for i in range(0, 250)] targets = [test_util.get_text(x) for x in t_files] predictions = [test_util.get_text(x) for x in p_files] assert len(targets) == len(predictions) assert len(targets) == 250 for target, prediction in zip(targets, predictions): aggregator.add_scores(scorer.score(target, prediction)) result = aggregator.aggregate() # DIR = testdata/pyrouge_evaluate_plain_text_files # pyrouge_evaluate_plain_text_files -s $DIR -sfp "prediction_multi.(.*).txt" # -m $DIR -mfp target_multi.#ID#.txt self.assertSimilarAggregates((0.58963, 0.59877, 0.60822), # P (0.37327, 0.38091, 0.38914), # R (0.45607, 0.46411, 0.47244), # F result["rouge1"]) self.assertSimilarAggregates((0.35429, 0.36516, 0.37665), # P (0.22341, 0.23109, 0.23916), # R (0.27312, 0.28209, 0.29133), # F result["rouge2"]) self.assertSimilarAggregates((0.58604, 0.59491, 0.60444), # P (0.37084, 0.37846, 0.38671), # R (0.45305, 0.46113, 0.46946), # F result["rougeLsum"]) if __name__ == "__main__": absltest.main()