|
|
|
|
|
|
|
|
|
|
|
|
|
|
use utf8; |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
use warnings; |
|
|
use strict; |
|
|
|
|
|
|
|
|
my $TIMES_TO_REPEAT_SUBSAMPLING = 1000; |
|
|
my $SUBSAMPLE_SIZE = 0; |
|
|
my $MAX_NGRAMS = 4; |
|
|
my $IO_ENCODING = "utf8"; |
|
|
|
|
|
|
|
|
if (@ARGV < 3) { |
|
|
print STDERR "Usage: ./bootstrap-hypothesis-difference-significance.pl hypothesis_1 hypothesis_2 reference_1 [ reference_2 ... ]\n"; |
|
|
|
|
|
unless ($ARGV[0] =~ /^(--help|-help|-h|-\?|\/\?|--usage|-usage)$/) { |
|
|
die("\nERROR: not enough arguments"); |
|
|
} |
|
|
|
|
|
exit 1; |
|
|
} |
|
|
|
|
|
print STDERR "reading data; " . `date`; |
|
|
|
|
|
|
|
|
my $data = readAllData(@ARGV); |
|
|
|
|
|
my $verbose = $ARGV[3]; |
|
|
|
|
|
|
|
|
print STDERR "performing preliminary calculations (hypothesis 1); " . `date`; |
|
|
preEvalHypo($data, "hyp1"); |
|
|
|
|
|
print STDERR "performing preliminary calculations (hypothesis 2); " . `date`; |
|
|
preEvalHypo($data, "hyp2"); |
|
|
|
|
|
|
|
|
print STDERR "comparing hypotheses -- this may take some time; " . `date`; |
|
|
|
|
|
bootstrap_report("BLEU", \&getBleu); |
|
|
bootstrap_report("NIST", \&getNist); |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
sub bootstrap_report { |
|
|
my $title = shift; |
|
|
my $proc = shift; |
|
|
|
|
|
my ($subSampleScoreDiffArr, $subSampleScore1Arr, $subSampleScore2Arr) = bootstrap_pass($proc); |
|
|
|
|
|
my $realScore1 = &$proc($data->{refs}, $data->{hyp1}); |
|
|
my $realScore2 = &$proc($data->{refs}, $data->{hyp2}); |
|
|
|
|
|
my $scorePValue = bootstrap_pvalue($subSampleScoreDiffArr, $realScore1, $realScore2); |
|
|
|
|
|
my ($scoreAvg1, $scoreVar1) = bootstrap_interval($subSampleScore1Arr); |
|
|
my ($scoreAvg2, $scoreVar2) = bootstrap_interval($subSampleScore2Arr); |
|
|
|
|
|
print "\n---=== $title score ===---\n"; |
|
|
|
|
|
print "actual score of hypothesis 1: $realScore1\n"; |
|
|
print "95% confidence interval for hypothesis 1 score: $scoreAvg1 +- $scoreVar1\n-----\n"; |
|
|
print "actual score of hypothesis 1: $realScore2\n"; |
|
|
print "95% confidence interval for hypothesis 2 score: $scoreAvg2 +- $scoreVar2\n-----\n"; |
|
|
print "Assuming that essentially the same system generated the two hypothesis translations (null-hypothesis),\n"; |
|
|
print "the probability of actually getting them (p-value) is: $scorePValue.\n"; |
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
sub bootstrap_pass { |
|
|
my $scoreFunc = shift; |
|
|
|
|
|
my @subSampleDiffArr; |
|
|
my @subSample1Arr; |
|
|
my @subSample2Arr; |
|
|
|
|
|
|
|
|
for my $idx (1..$TIMES_TO_REPEAT_SUBSAMPLING) { |
|
|
my $subSampleIndices = drawWithReplacement($data->{size}, ($SUBSAMPLE_SIZE? $SUBSAMPLE_SIZE: $data->{size})); |
|
|
|
|
|
my $score1 = &$scoreFunc($data->{refs}, $data->{hyp1}, $subSampleIndices); |
|
|
my $score2 = &$scoreFunc($data->{refs}, $data->{hyp2}, $subSampleIndices); |
|
|
|
|
|
push @subSampleDiffArr, abs($score2 - $score1); |
|
|
push @subSample1Arr, $score1; |
|
|
push @subSample2Arr, $score2; |
|
|
|
|
|
if ($idx % 10 == 0) { |
|
|
print STDERR "."; |
|
|
} |
|
|
if ($idx % 100 == 0) { |
|
|
print STDERR "$idx\n"; |
|
|
} |
|
|
} |
|
|
|
|
|
if ($TIMES_TO_REPEAT_SUBSAMPLING % 100 != 0) { |
|
|
print STDERR ".$TIMES_TO_REPEAT_SUBSAMPLING\n"; |
|
|
} |
|
|
|
|
|
return (\@subSampleDiffArr, \@subSample1Arr, \@subSample2Arr); |
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
sub bootstrap_pvalue { |
|
|
my $subSampleDiffArr = shift; |
|
|
my $realScore1 = shift; |
|
|
my $realScore2 = shift; |
|
|
|
|
|
my $realDiff = abs($realScore2 - $realScore1); |
|
|
|
|
|
|
|
|
my $averageSubSampleDiff = 0; |
|
|
|
|
|
for my $subSampleDiff (@$subSampleDiffArr) { |
|
|
$averageSubSampleDiff += $subSampleDiff; |
|
|
} |
|
|
|
|
|
$averageSubSampleDiff /= $TIMES_TO_REPEAT_SUBSAMPLING; |
|
|
|
|
|
|
|
|
my $count = 0; |
|
|
|
|
|
my $realScoreDiff = abs($realScore2 - $realScore1); |
|
|
|
|
|
for my $subSampleDiff (@$subSampleDiffArr) { |
|
|
if ($subSampleDiff - $averageSubSampleDiff >= $realDiff) { |
|
|
$count++; |
|
|
} |
|
|
} |
|
|
|
|
|
return $count / $TIMES_TO_REPEAT_SUBSAMPLING; |
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
sub bootstrap_interval { |
|
|
my $subSampleArr = shift; |
|
|
|
|
|
my @sorted = sort @$subSampleArr; |
|
|
|
|
|
my $lowerIdx = int($TIMES_TO_REPEAT_SUBSAMPLING / 40); |
|
|
my $higherIdx = $TIMES_TO_REPEAT_SUBSAMPLING - $lowerIdx - 1; |
|
|
|
|
|
my $lower = $sorted[$lowerIdx]; |
|
|
my $higher = $sorted[$higherIdx]; |
|
|
my $diff = $higher - $lower; |
|
|
|
|
|
return ($lower + 0.5 * $diff, 0.5 * $diff); |
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
sub readAllData { |
|
|
my ($hypFile1, $hypFile2, @refFiles) = @_; |
|
|
|
|
|
my %result; |
|
|
|
|
|
|
|
|
$result{hyp1} = readData($hypFile1); |
|
|
$result{size} = scalar @{$result{hyp1}}; |
|
|
|
|
|
$result{hyp2} = readData($hypFile2); |
|
|
unless (scalar @{$result{hyp2}} == $result{size}) { |
|
|
die ("ERROR: sizes of hypothesis sets 1 and 2 don't match"); |
|
|
} |
|
|
|
|
|
|
|
|
$result{refs} = []; |
|
|
$result{ngramCounts} = { }; |
|
|
my $i = 0; |
|
|
|
|
|
for my $refFile (@refFiles) { |
|
|
$i++; |
|
|
my $refDataX = readData($refFile); |
|
|
|
|
|
unless (scalar @$refDataX == $result{size}) { |
|
|
die ("ERROR: ref set $i size doesn't match the size of hyp sets"); |
|
|
} |
|
|
|
|
|
updateCounts($result{ngramCounts}, $refDataX); |
|
|
|
|
|
push @{$result{refs}}, $refDataX; |
|
|
} |
|
|
|
|
|
return \%result; |
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
sub updateCounts { |
|
|
my ($countHash, $refData) = @_; |
|
|
|
|
|
for my $snt(@$refData) { |
|
|
my $size = scalar @{$snt->{words}}; |
|
|
$countHash->{""} += $size; |
|
|
|
|
|
for my $order(1..$MAX_NGRAMS) { |
|
|
my $ngram; |
|
|
|
|
|
for my $i (0..($size-$order)) { |
|
|
$ngram = join(" ", @{$snt->{words}}[$i..($i + $order - 1)]); |
|
|
|
|
|
$countHash->{$ngram}++; |
|
|
} |
|
|
} |
|
|
} |
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
sub ngramInfo { |
|
|
my ($data, $ngram) = @_; |
|
|
|
|
|
my @nwords = split(/ /, $ngram); |
|
|
pop @nwords; |
|
|
my $smallGram = join(" ", @nwords); |
|
|
|
|
|
return log($data->{ngramCounts}->{$smallGram} / $data->{ngramCounts}->{$ngram}) / log(2.0); |
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
sub readData { |
|
|
my $file = shift; |
|
|
my @result; |
|
|
|
|
|
open (FILE, $file) or die ("Failed to open `$file' for reading"); |
|
|
binmode (FILE, ":$IO_ENCODING"); |
|
|
|
|
|
while (<FILE>) { |
|
|
push @result, { words => [split(/\s+/, $_)] }; |
|
|
} |
|
|
|
|
|
close (FILE); |
|
|
|
|
|
return \@result; |
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
sub preEvalHypo { |
|
|
my $data = shift; |
|
|
my $hypId = shift; |
|
|
|
|
|
for my $lineIdx (0..($data->{size} - 1)) { |
|
|
preEvalHypoSnt($data, $hypId, $lineIdx); |
|
|
} |
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
sub preEvalHypoSnt { |
|
|
my ($data, $hypId, $lineIdx) = @_; |
|
|
|
|
|
my ($correctNgramCounts, $totalNgramCounts); |
|
|
my ($refNgramCounts, $hypNgramCounts); |
|
|
my ($coocNgramInfoSum, $totalNgramAmt); |
|
|
|
|
|
my $hypSnt = $data->{$hypId}->[$lineIdx]; |
|
|
|
|
|
|
|
|
$hypSnt->{hyplen} = scalar @{$hypSnt->{words}}; |
|
|
|
|
|
|
|
|
$hypSnt->{reflen} = getClosestLength($data->{refs}, $lineIdx, $hypSnt->{hyplen}); |
|
|
$hypSnt->{avgreflen} = getAvgLength($data->{refs}, $lineIdx); |
|
|
|
|
|
$hypSnt->{correctNgrams} = []; |
|
|
$hypSnt->{totalNgrams} = []; |
|
|
|
|
|
|
|
|
for my $order (1..$MAX_NGRAMS) { |
|
|
|
|
|
$hypNgramCounts = groupNgrams($hypSnt, $order); |
|
|
|
|
|
|
|
|
$refNgramCounts = groupNgramsMultiSrc($data->{refs}, $lineIdx, $order); |
|
|
|
|
|
$correctNgramCounts = 0; |
|
|
$totalNgramCounts = 0; |
|
|
$coocNgramInfoSum = 0; |
|
|
$totalNgramAmt = 0; |
|
|
my $coocUpd; |
|
|
|
|
|
|
|
|
for my $ngram (keys %$hypNgramCounts) { |
|
|
if (!exists $refNgramCounts->{$ngram}) { |
|
|
$refNgramCounts->{$ngram} = 0; |
|
|
} |
|
|
$coocUpd = min($hypNgramCounts->{$ngram}, $refNgramCounts->{$ngram}); |
|
|
$correctNgramCounts += $coocUpd; |
|
|
$totalNgramCounts += $hypNgramCounts->{$ngram}; |
|
|
|
|
|
if ($coocUpd > 0) { |
|
|
$coocNgramInfoSum += ngramInfo($data, $ngram); |
|
|
} |
|
|
|
|
|
$totalNgramAmt++; |
|
|
} |
|
|
|
|
|
$hypSnt->{correctNgrams}->[$order] = $correctNgramCounts; |
|
|
$hypSnt->{totalNgrams}->[$order] = $totalNgramCounts; |
|
|
$hypSnt->{ngramNistInfoSum}->[$order] = $coocNgramInfoSum; |
|
|
$hypSnt->{ngramNistCount}->[$order] = $totalNgramAmt; |
|
|
} |
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
sub drawWithReplacement { |
|
|
my ($setSize, $subSize) = @_; |
|
|
|
|
|
my @result; |
|
|
|
|
|
for (1..$subSize) { |
|
|
push @result, int(rand($setSize)); |
|
|
} |
|
|
|
|
|
return \@result; |
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
sub getNist { |
|
|
my ($refs, $hyp, $idxs) = @_; |
|
|
|
|
|
|
|
|
unless (defined($idxs)) { |
|
|
$idxs = [0..((scalar @$hyp) - 1)]; |
|
|
} |
|
|
|
|
|
|
|
|
my ($hypothesisLength, $referenceLength) = (0, 0); |
|
|
my (@infosum, @totalamt); |
|
|
|
|
|
|
|
|
for my $lineIdx (@$idxs) { |
|
|
|
|
|
my $hypSnt = $hyp->[$lineIdx]; |
|
|
|
|
|
|
|
|
$hypothesisLength += $hypSnt->{hyplen}; |
|
|
|
|
|
|
|
|
$referenceLength += $hypSnt->{avgreflen}; |
|
|
|
|
|
|
|
|
for my $order (1..$MAX_NGRAMS) { |
|
|
$infosum[$order] += $hypSnt->{ngramNistInfoSum}->[$order]; |
|
|
$totalamt[$order] += $hypSnt->{ngramNistCount}->[$order]; |
|
|
} |
|
|
} |
|
|
|
|
|
my $toplog = log($hypothesisLength / $referenceLength); |
|
|
my $btmlog = log(2.0/3.0); |
|
|
|
|
|
|
|
|
my $brevityPenalty = ($hypothesisLength > $referenceLength)? 1.0: exp(log(0.5) * $toplog * $toplog / ($btmlog * $btmlog)); |
|
|
|
|
|
my $sum = 0; |
|
|
|
|
|
for my $order (1..$MAX_NGRAMS) { |
|
|
$sum += $infosum[$order]/$totalamt[$order]; |
|
|
} |
|
|
|
|
|
my $result = $sum * $brevityPenalty; |
|
|
|
|
|
return $result; |
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
sub getBleu { |
|
|
my ($refs, $hyp, $idxs) = @_; |
|
|
|
|
|
|
|
|
unless (defined($idxs)) { |
|
|
$idxs = [0..((scalar @$hyp) - 1)]; |
|
|
} |
|
|
|
|
|
|
|
|
my ($hypothesisLength, $referenceLength) = (0, 0); |
|
|
my (@correctNgramCounts, @totalNgramCounts); |
|
|
my ($refNgramCounts, $hypNgramCounts); |
|
|
|
|
|
|
|
|
for my $lineIdx (@$idxs) { |
|
|
my $hypSnt = $hyp->[$lineIdx]; |
|
|
|
|
|
|
|
|
$hypothesisLength += $hypSnt->{hyplen}; |
|
|
|
|
|
|
|
|
$referenceLength += $hypSnt->{reflen}; |
|
|
|
|
|
|
|
|
for my $order (1..$MAX_NGRAMS) { |
|
|
$correctNgramCounts[$order] += $hypSnt->{correctNgrams}->[$order]; |
|
|
$totalNgramCounts[$order] += $hypSnt->{totalNgrams}->[$order]; |
|
|
} |
|
|
} |
|
|
|
|
|
|
|
|
my $brevityPenalty = ($hypothesisLength < $referenceLength)? exp(1 - $referenceLength/$hypothesisLength): 1; |
|
|
|
|
|
my $logsum = 0; |
|
|
|
|
|
for my $order (1..$MAX_NGRAMS) { |
|
|
$logsum += safeLog($correctNgramCounts[$order] / $totalNgramCounts[$order]); |
|
|
} |
|
|
|
|
|
return $brevityPenalty * exp($logsum / $MAX_NGRAMS); |
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
sub getAvgLength { |
|
|
my ($refs, $lineIdx) = @_; |
|
|
|
|
|
my $result = 0; |
|
|
my $count = 0; |
|
|
|
|
|
for my $ref (@$refs) { |
|
|
$result += scalar @{$ref->[$lineIdx]->{words}}; |
|
|
$count++; |
|
|
} |
|
|
|
|
|
return $result / $count; |
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
sub getClosestLength { |
|
|
my ($refs, $lineIdx, $hypothesisLength) = @_; |
|
|
|
|
|
my $bestDiff = infty(); |
|
|
my $bestLen = infty(); |
|
|
|
|
|
my ($currLen, $currDiff); |
|
|
|
|
|
for my $ref (@$refs) { |
|
|
$currLen = scalar @{$ref->[$lineIdx]->{words}}; |
|
|
$currDiff = abs($currLen - $hypothesisLength); |
|
|
|
|
|
if ($currDiff < $bestDiff or ($currDiff == $bestDiff and $currLen < $bestLen)) { |
|
|
$bestDiff = $currDiff; |
|
|
$bestLen = $currLen; |
|
|
} |
|
|
} |
|
|
|
|
|
return $bestLen; |
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
sub groupNgrams { |
|
|
my ($snt, $order) = @_; |
|
|
my %result; |
|
|
|
|
|
my $size = scalar @{$snt->{words}}; |
|
|
my $ngram; |
|
|
|
|
|
for my $i (0..($size-$order)) { |
|
|
$ngram = join(" ", @{$snt->{words}}[$i..($i + $order - 1)]); |
|
|
|
|
|
$result{$ngram}++; |
|
|
} |
|
|
|
|
|
return \%result; |
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
sub groupNgramsMultiSrc { |
|
|
my ($refs, $lineIdx, $order) = @_; |
|
|
my %result; |
|
|
|
|
|
for my $ref (@$refs) { |
|
|
my $currNgramCounts = groupNgrams($ref->[$lineIdx], $order); |
|
|
|
|
|
for my $currNgram (keys %$currNgramCounts) { |
|
|
if (!exists $result{$currNgram}) { |
|
|
$result{$currNgram} = 0; |
|
|
} |
|
|
$result{$currNgram} = max($result{$currNgram}, $currNgramCounts->{$currNgram}); |
|
|
} |
|
|
} |
|
|
|
|
|
return \%result; |
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
sub safeLog { |
|
|
my $x = shift; |
|
|
|
|
|
return ($x > 0)? log($x): -infty(); |
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
sub infty { |
|
|
return 1e6000; |
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
sub min { |
|
|
my ($a, $b) = @_; |
|
|
|
|
|
return ($a < $b)? $a: $b; |
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
sub max { |
|
|
my ($a, $b) = @_; |
|
|
|
|
|
return ($a > $b)? $a: $b; |
|
|
} |
|
|
|
|
|
sub poww { |
|
|
my ($a, $b) = @_; |
|
|
|
|
|
return exp($b * log($a)); |
|
|
} |
|
|
|