| | |
| |
|
| | |
| | |
| | |
| | |
| | |
| | |
| |
|
| | use strict; |
| | use Getopt::Long; |
| | use Pod::Usage; |
| | use vars qw($Verbose $CER $IgnoreUttID); |
| | use utf8; |
| |
|
# Command-line handling: parse the switches, then insist on exactly two
# positional arguments (the reference file and the hypothesis file).
my $help;
my %hyphash;    # utterance ID => normalised hypothesis line, filled below

my $options_ok = GetOptions(
    'help|?'         => \$help,
    'verbose|v'      => \$Verbose,
    'cer|c'          => \$CER,
    'ignore-uttid|i' => \$IgnoreUttID,
);
pod2usage(1) unless $options_ok;
pod2usage(1) if $help;

if (@ARGV != 2) {
    pod2usage(2);
}
my ($ref, $hyp) = @ARGV;
| |
|
# Both input files are decoded as UTF-8 and STDOUT is encoded as UTF-8.
# Without decoding, --cer would split multi-byte characters into single
# bytes and the \p{Han} tests used when printing alignments could never
# match.  All three opens use the same layer so that utterance IDs from the
# two files stay comparable.
binmode(STDOUT, ':encoding(UTF-8)');

# First pass over the hypothesis file: index every utterance by its ID.
# The stored value keeps the ID in trailing parentheses because the scoring
# loop passes it through s3_magic_norm() a second time to recover the ID.
# Three-argument open keeps odd characters in the file name from being
# interpreted as part of the open mode.
open(HYP, '<:encoding(UTF-8)', $hyp) or die "Failed to open $hyp: $!";
while (defined(my $hyp_utt = <HYP>)) {
    my $hyp_uttid;
    ($hyp_utt, $hyp_uttid) = s3_magic_norm($hyp_utt);
    $hyphash{$hyp_uttid} = "$hyp_utt ($hyp_uttid)";
}
close HYP;

# Second pass: the reference file drives the scoring loop; the hypothesis
# file is reopened so it can be read line by line when utterance IDs are
# ignored.
open(REF, '<:encoding(UTF-8)', $ref) or die "Failed to open $ref: $!";
open(HYP, '<:encoding(UTF-8)', $hyp) or die "Failed to open $hyp: $!";
| |
|
# Backpointer codes stored in the backtrace matrix, plus a sentinel that is
# larger than any alignment cost.
use constant {
    INS        => 1,
    DEL        => 2,
    MATCH      => 3,
    SUBST      => 4,
    BIG_NUMBER => 1e50,
};
| |
|
# Running totals over all utterances.  They start at zero so that the
# summary printed after the loop shows numbers (not empty strings) even
# when the reference file contains no utterances.
my ($total_words, $total_match, $total_cost, $total_hyp) = (0, 0, 0, 0);
my ($total_ins, $total_del, $total_subst) = (0, 0, 0);

# Read the reference file one utterance per line, pair each line with its
# hypothesis, align the two, then print the alignment and the
# per-utterance statistics.
while (defined(my $ref_utt = <REF>)) {
    my ($hyp_utt, $ref_uttid, $hyp_uttid);

    ($ref_utt, $ref_uttid) = s3_magic_norm($ref_utt);

    if (defined $IgnoreUttID) {
        # Pair utterances by position: take the next hypothesis line.
        $hyp_utt = <HYP>;
        die "UttID is ignored but file size mismatch between $ref and $hyp"
            unless defined($hyp_utt);
    }
    else {
        # Pair utterances by ID through the table built from the
        # hypothesis file.
        $hyp_utt = $hyphash{$ref_uttid};
        die "UttID is not ignored but it could not be found in any entry of the hypothesis file on line $. UTTID: $ref_uttid\n"
            unless defined($hyp_utt);
    }

    ($hyp_utt, $hyp_uttid) = s3_magic_norm($hyp_utt);

    if (!defined $IgnoreUttID) {
        die "Utterance ID mismatch on line $.: $ref_uttid != $hyp_uttid"
            unless $ref_uttid eq $hyp_uttid;
    }

    # Tokenise on whitespace; with --cer every token is further split into
    # its individual characters.
    my @ref_words = split ' ', $ref_utt;
    my @hyp_words = split ' ', $hyp_utt;
    if ($CER) {
        @ref_words = map { split "" } @ref_words;
        @hyp_words = map { split "" } @hyp_words;
    }

    my (@align_matrix, @backtrace_matrix);

    # Seed the borders of the matrices, fill in the interior, then walk the
    # backpointers to recover the alignment and the edit counts.
    initialize(\@ref_words, \@hyp_words, \@align_matrix, \@backtrace_matrix);
    my $cost = align(\@ref_words, \@hyp_words, \@align_matrix, \@backtrace_matrix);
    my ($alignment, $ins, $del, $subst, $match)
        = backtrace(\@ref_words, \@hyp_words, \@align_matrix, \@backtrace_matrix);

    # Render the alignment as two lines with the tokens in matching columns.
    my ($ref_align, $hyp_align) = ("", "");
    foreach my $pair (@$alignment) {
        my ($ref_tok, $hyp_tok) = @$pair;

        if (defined($ref_tok) and defined($hyp_tok)) {
            if ($CER
                or $ref_tok =~ /\p{InCJKUnifiedIdeographs}/
                or $ref_tok =~ /\p{Han}/
                or $hyp_tok =~ /\p{Han}/) {
                # Letter case cannot flag an error here, so mismatches are
                # wrapped in asterisks instead.
                if ($ref_tok ne $hyp_tok) {
                    $ref_tok = "*$ref_tok*";
                    $hyp_tok = "*$hyp_tok*";
                }
            }
            elsif ($ref_tok eq $hyp_tok) {
                # Tokens were upper-cased by s3_magic_norm(); matches are
                # shown in lower case so mismatches stand out in upper case.
                $ref_tok = lc $ref_tok;
                $hyp_tok = lc $hyp_tok;
            }
        }

        # An insertion or deletion has no partner token on the other side.
        $ref_tok = "***" unless defined $ref_tok;
        $hyp_tok = "***" unless defined $hyp_tok;

        # Column width: the longer of the two tokens, but never below 3.
        my $width = 3;
        foreach my $tok ($ref_tok, $hyp_tok) {
            $width = length($tok) if length($tok) > $width;
        }

        $ref_align .= sprintf("%-*s ", $width, $ref_tok);
        $hyp_align .= sprintf("%-*s ", $width, $hyp_tok);
    }

    # Either ID may be missing (for instance when IDs are ignored); print
    # an empty pair of parentheses rather than interpolating undef.
    $ref_uttid = "" unless defined $ref_uttid;
    $hyp_uttid = "" unless defined $hyp_uttid;
    print "$ref_align ($ref_uttid)\n$hyp_align ($hyp_uttid)\n";

    # Per-utterance rates.  An empty reference is reported as 100% error
    # and 0% correct instead of dividing by zero.
    my $error = @ref_words == 0 ? 1 : $cost/@ref_words;
    my $acc   = @ref_words == 0 ? 0 : $match/@ref_words;
    printf("Words: %d Correct: %d Errors: %d Percent correct = %.2f%% Error = %.2f%% Accuracy = %.2f%%\n",
           scalar(@ref_words), $match, $cost, $acc*100, $error*100, 100-$error*100);
    print "Insertions: $ins Deletions: $del Substitutions: $subst\n";

    $total_cost  += $cost;
    $total_match += $match;
    $total_words += @ref_words;
    $total_hyp   += @hyp_words;
    $total_ins   += $ins;
    $total_del   += $del;
    $total_subst += $subst;
}
| | |
# Overall summary across every utterance that was scored.
#
# Rates are normally relative to the number of reference words.  When the
# reference contained no words at all, the number of hypothesis words is
# used as the denominator.  When that is zero too (for example, empty input
# files), both rates are reported as 0 instead of dying with a division by
# zero.
foreach my $total ($total_words, $total_match, $total_cost, $total_hyp,
                   $total_ins, $total_del, $total_subst) {
    # The loop variable aliases the real variable, so this fills in any
    # accumulator that was never assigned.
    $total = 0 unless defined $total;
}

my ($error, $acc) = (0, 0);
my $denominator = $total_words || $total_hyp;
if ($denominator) {
    $error = $total_cost/$denominator;
    $acc   = $total_match/$denominator;
}
printf("TOTAL Words: %d Correct: %d Errors: %d\nTOTAL Percent correct = %.2f%% Error = %.2f%% Accuracy = %.2f%%\n",
       $total_words, $total_match, $total_cost, $acc*100, $error*100, 100-$error*100);
print "TOTAL Insertions: $total_ins Deletions: $total_del Substitutions: $total_subst\n";
| |
|
| | |
# Normalise one transcript line for scoring.
#
# Argument: the raw line, optionally ending in "(UTTID ...)".
# Returns:  a two-element list (normalised text, utterance ID).  The ID is
#           undef when the line does not end in a parenthesised ID.
sub s3_magic_norm {
    my ($word) = @_;
    my $uttid;

    # Drop the line terminator (LF, CR or any run of them).
    $word =~ s/[\n\r]+$//;

    # Strip a trailing "(id ...)"; the ID is everything up to the first
    # space or closing parenthesis.  $1 is only read when the substitution
    # actually matched: after a failed match it would still hold whatever
    # an earlier successful match left behind.
    if ($word =~ s/\(([^) ]+)[^)]*\)$//) {
        $uttid = $1;
    }

    # Scoring is case-insensitive.
    $word = uc $word;

    # Remove markup tokens: <...>, ++...++ and +...+.
    $word =~ s/<[^>]+>//g;
    $word =~ s/\+\+[^+]+\+\+//g;
    $word =~ s/\+[^+]+\+//g;

    # Remove single-digit "(N)" suffixes.
    $word =~ s/\([1-9]\)//g;

    # Remove a colon together with the non-space characters that follow it.
    $word =~ s/:\S+//g;

    # Treat hyphens, underscores and periods as word separators.
    $word =~ tr/-_./ /;

    return ($word, $uttid);
}
| |
|
# Seed the first row and first column of the cost and backpointer matrices.
#
# Arguments: references to the reference words, the hypothesis words, the
# cost matrix and the backpointer matrix.  Both matrices are modified in
# place.
sub initialize {
    my ($ref_words, $hyp_words, $align_matrix, $backtrace_matrix) = @_;

    # Row 0: reaching hypothesis position $col from an empty reference
    # costs $col insertions.
    foreach my $col (0 .. scalar(@$hyp_words)) {
        $align_matrix->[0][$col]     = $col;
        $backtrace_matrix->[0][$col] = INS;
    }

    # Column 0: reaching reference position $row with an empty hypothesis
    # costs $row deletions.  This runs second, so cell [0][0] ends up
    # marked DEL.
    foreach my $row (0 .. scalar(@$ref_words)) {
        $align_matrix->[$row][0]     = $row;
        $backtrace_matrix->[$row][0] = DEL;
    }

    return;
}
| |
|
# Fill in the interior of the edit-distance matrices.
#
# Arguments: references to the reference words, the hypothesis words, the
# cost matrix and the backpointer matrix (the latter two must already have
# row 0 and column 0 filled in).  Returns the total alignment cost, i.e.
# the value in the bottom-right cell of the cost matrix.
sub align {
    my ($ref_words, $hyp_words, $align_matrix, $backtrace_matrix) = @_;

    my $ref_count = @$ref_words;
    my $hyp_count = @$hyp_words;

    foreach my $i (1 .. $ref_count) {
        foreach my $j (1 .. $hyp_count) {
            # Substitution is free when the two tokens are identical.
            my $cost = $$ref_words[$i-1] ne $$hyp_words[$j-1];

            # Cost of arriving at this cell by each of the three moves.
            my $ins   = $align_matrix->[$i][$j - 1] + 1;
            my $del   = $align_matrix->[$i - 1][$j] + 1;
            my $subst = $align_matrix->[$i - 1][$j - 1] + $cost;
            print "Costs at $i $j: INS $ins DEL $del SUBST $subst\n" if $Verbose;

            # Cheapest of the three.
            my $min = BIG_NUMBER;
            foreach my $candidate ($ins, $del, $subst) {
                $min = $candidate if $candidate < $min;
            }
            $align_matrix->[$i][$j] = $min;

            # Record the move taken.  Ties are resolved in this order:
            # match/substitution, then insertion, then deletion.
            if ($min == $subst) {
                print(($cost ? "SUBSTITUTION" : "MATCH"),
                      "($$ref_words[$i-1] <=> $$hyp_words[$j-1])\n") if $Verbose;
                # MATCH when $cost is false, MATCH+1 when the tokens differ.
                $backtrace_matrix->[$i][$j] = MATCH+$cost;
            }
            elsif ($min == $ins) {
                print "INSERTION (0 => $$hyp_words[$j-1])\n" if $Verbose;
                $backtrace_matrix->[$i][$j] = INS;
            }
            elsif ($min == $del) {
                print "DELETION ($$ref_words[$i-1] => 0)\n" if $Verbose;
                $backtrace_matrix->[$i][$j] = DEL;
            }
        }
    }

    return $align_matrix->[$ref_count][$hyp_count];
}
| |
|
# Follow the backpointers from the bottom-right cell back to the origin.
#
# Arguments: references to the reference words, the hypothesis words, the
# cost matrix and the backpointer matrix.
# Returns:   (alignment, insertions, deletions, substitutions, matches),
#            where alignment is a reference to a list of [ref, hyp] pairs in
#            left-to-right order; the missing side of an insertion or
#            deletion is undef.
sub backtrace {
    my ($ref_words, $hyp_words, $align_matrix, $backtrace_matrix) = @_;

    my @alignment;
    my ($inspen, $delpen, $substpen, $match) = (0, 0, 0, 0);
    my ($i, $j) = (scalar(@$ref_words), scalar(@$hyp_words));

    until ($i == 0 and $j == 0) {
        my $pointer = $$backtrace_matrix[$i][$j];
        print "Cost at $i $j: $$align_matrix[$i][$j]\n"
            if $Verbose;

        # How far to step back along each axis for this move.
        my ($ref_step, $hyp_step) = (0, 0);

        if ($pointer == INS) {
            print "INSERTION (0 => $$hyp_words[$j-1])" if $Verbose;
            unshift @alignment, [undef, $$hyp_words[$j-1]];
            ++$inspen;
            $hyp_step = 1;
        }
        elsif ($pointer == DEL) {
            print "DELETION ($$ref_words[$i-1] => 0)" if $Verbose;
            unshift @alignment, [$$ref_words[$i-1], undef];
            ++$delpen;
            $ref_step = 1;
        }
        elsif ($pointer == MATCH) {
            print "MATCH ($$ref_words[$i-1] <=> $$hyp_words[$j-1])" if $Verbose;
            unshift @alignment, [$$ref_words[$i-1], $$hyp_words[$j-1]];
            ++$match;
            ($ref_step, $hyp_step) = (1, 1);
        }
        elsif ($pointer == SUBST) {
            print "SUBSTITUTION ($$ref_words[$i-1] <=> $$hyp_words[$j-1])" if $Verbose;
            unshift @alignment, [$$ref_words[$i-1], $$hyp_words[$j-1]];
            ++$substpen;
            ($ref_step, $hyp_step) = (1, 1);
        }
        else {
            # Unknown backpointer: stop instead of looping forever.
            last;
        }

        $j -= $hyp_step;
        $i -= $ref_step;
        print " - moving to $i $j\n" if $Verbose;
    }

    return (\@alignment, $inspen, $delpen, $substpen, $match);
}
| |
|
| | __END__ |
| |
|
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| |
|