File size: 3,350 Bytes
fd49381 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 |
#!/usr/bin/env perl
#
# This file is part of moses. Its use is licensed under the GNU Lesser General
# Public License version 2.1 or, at your option, any later version.
# $Id: train-recaser.perl 1326 2007-03-26 05:44:27Z bojar $
use warnings;
use strict;
use Getopt::Long "GetOptions";
# Read and write text through the :utf8 layer.
for my $stream (\*STDIN, \*STDOUT) {
    binmode($stream, ":utf8");
}

# Command-line switches:
#   --model FILE      truecasing model to load (required)
#   -b, --unbuffered  flush output after every write
#   -a, --asr         input is ASR output, which carries no case information:
#                     every word is lowercased before lookup and the model's
#                     list of known casings is not consulted, so a word such
#                     as 'i' is always replaced by its best casing
my ($MODEL, $UNBUFFERED, $ASR);
unless (
    GetOptions(
        'model=s'      => \$MODEL,
        'b|unbuffered' => \$UNBUFFERED,
        'a|asr'        => \$ASR,
    )
    && defined($MODEL)
) {
    die("truecase.perl --model MODEL [-b] [-a] < in > out");
}

# $| applies to the currently selected output handle.
$| = 1 if $UNBUFFERED;

# Plain 0/1 copy of the --asr switch, read by the rest of the script.
my $asr = $ASR ? 1 : 0;
# Load the truecasing model named by --model.
#
# %BEST   maps a lowercased word to the casing given as the first token of
#         its model line.
# %KNOWN  records casings listed in the model. It is left empty in ASR mode,
#         so that every word is looked up through %BEST instead.
#
# Dies if the model file cannot be opened.
my (%BEST,%KNOWN);
{
    # Three-argument open with a lexical handle: the path is taken literally,
    # so characters such as '>' or '|' in it cannot change the open mode or
    # start a command.
    open(my $model_fh, '<:utf8', $MODEL)
        or die("ERROR: could not open '$MODEL': $!");
    while (my $entry = <$model_fh>) {
        my ($word,@OPTIONS) = split(' ', $entry);
        next unless defined($word);    # blank line: nothing to record
        $BEST{ lc($word) } = $word;
        next if $asr;
        $KNOWN{ $word } = 1;
        # Every second token starting at index 1 is recorded as a known
        # casing; the final token is never visited.
        # NOTE(review): presumably the remaining tokens alternate
        # "(count)" and alternative casing - confirm against the script
        # that writes the model.
        for (my $i = 1; $i < $#OPTIONS; $i += 2) {
            $KNOWN{ $OPTIONS[$i] } = 1;
        }
    }
    close($model_fh);
}
# Tokens after which the next word counts as the start of a sentence.
my %SENTENCE_END = ("."=>1,":"=>1,"?"=>1,"!"=>1);

# Tokens that leave the sentence-start state untouched, so the word after
# them is still treated as a sentence start if one was pending.
# Both the literal characters and their escaped entity forms are listed.
# NOTE(review): the entity spellings are restored on the assumption that the
# input was escaped by the Moses tokenizer - confirm against the tokenizer.
my %DELAYED_SENTENCE_START = (
    "("      => 1,
    "["      => 1,
    "]"      => 1,
    "\""     => 1,
    "'"      => 1,
    "&apos;" => 1,
    "&quot;" => 1,
    "&#91;"  => 1,
    "&#93;"  => 1,
);

# Truecase STDIN line by line and write the result to STDOUT.
while (my $line = <STDIN>) {
    # chomp, not chop: only a trailing newline is removed, so a final line
    # that lacks one keeps its last character.
    chomp($line);
    my ($WORD,$MARKUP) = split_xml($line);
    my $sentence_start = 1;
    my $out = "";
    for my $i (0 .. $#$WORD) {
        # Markup entries already end in a space; a separator is only added
        # when there is no markup in front of this word.
        # NOTE(review): a tag between two words is therefore emitted with no
        # space before it ("a <b> c" -> "a<b> c") - confirm this is intended.
        $out .= " " if $i && $$MARKUP[$i] eq '';
        $out .= $$MARKUP[$i];

        # Only the part before the first '|' is truecased; the rest of the
        # token (further factors) is copied through unchanged.
        my ($word,$otherfactors) = ($$WORD[$i], "");
        if ($$WORD[$i] =~ /^([^\|]+)(.*)/) {
            ($word,$otherfactors) = ($1,$2);
        }
        $word = lc($word) if $asr;    # make sure ASR output is not uc

        # Use the model's best casing at a sentence start, or when this
        # exact casing is not known to the model; otherwise keep the word.
        my $best = $BEST{ lc($word) };
        if (defined($best) && ($sentence_start || !defined($KNOWN{$word}))) {
            $out .= $best;
        }
        else {
            $out .= $word;
        }
        $out .= $otherfactors;

        if (defined($SENTENCE_END{ $word })) {
            $sentence_start = 1;
        }
        elsif (!defined($DELAYED_SENTENCE_START{ $word })) {
            $sentence_start = 0;
        }
    }
    # Markup that followed the last word, then the line terminator.
    print $out, $$MARKUP[$#$MARKUP], "\n";
}
# Separate one line of text into word tokens and the XML markup around them.
#
# Argument: the line to split.
# Returns two array references (words, markup). markup always has one more
# element than words: element k holds the tags seen before word k, each
# followed by a single space, and the last element holds the tags after the
# final word with its trailing space removed.
#
# A tag written directly after a word that ends in '|' (no whitespace in
# between) is treated as a factor of that word: it is appended to the word,
# together with any '|' characters immediately following the tag.
#
# Dies if a non-blank remainder cannot be matched as a tag or a word.
sub split_xml {
    my ($line) = @_;
    my @WORD;
    my @MARKUP = ("");
    while ($line =~ /\S/) {
        my $rest;
        # XML tag
        if ($line =~ /^\s*(<\S[^>]*>)(.*)$/) {
            my $tag = $1;
            $rest = $2;
            # exception for factor that is an XML tag
            if ($line =~ /^\S/ && @WORD > 0 && $WORD[-1] =~ /\|$/) {
                $WORD[-1] .= $tag;
                if ($rest =~ /^(\|+)(.*)$/) {
                    $WORD[-1] .= $1;
                    $rest = $2;
                }
            }
            else {
                $MARKUP[-1] .= $tag." ";
            }
        }
        # Ordinary word first; failing that, a token containing '<' or '>'
        # that is not an XML tag. The captures come from whichever pattern
        # matched.
        elsif ($line =~ /^\s*([^\s<>]+)(.*)$/ || $line =~ /^\s*(\S+)(.*)$/) {
            push @WORD, $1;
            push @MARKUP, "";    # markup slot for whatever follows this word
            $rest = $2;
        }
        else {
            die("ERROR: huh? $line\n");
        }
        $line = $rest;
    }
    # Drop the space that trails the last tag, if there is one.
    chop($MARKUP[-1]);
    return (\@WORD,\@MARKUP);
}
|