|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
use warnings;
use strict;
use Getopt::Long "GetOptions";

# Train a truecasing model: read a cased, tokenized corpus, count how often
# each surface form of a word occurs in positions where its casing is
# informative, and write the counts to the model file.
#
# Options:
#   --corpus FILE            cased training corpus, one sentence per line (required)
#   --model FILE             model file to create (required)
#   --possiblyUseFirstToken  also collect (limited) evidence from the first
#                            token of a sentence (see the weighting below)

my ($MODEL, $CORPUS);
my $possiblyUseFirstToken = 0;
die("train-truecaser.perl --model truecaser --corpus cased [--possiblyUseFirstToken]")
    unless GetOptions('corpus=s' => \$CORPUS,
                      'model=s'  => \$MODEL,
                      'possiblyUseFirstToken' => \$possiblyUseFirstToken)
        && defined($CORPUS) && defined($MODEL);

# $CASING{lowercased word}{surface form} = accumulated weight.
my %CASING;

# Tokens after which the next token is treated as sentence-initial.
my %SENTENCE_END = map { $_ => 1 } ('.', ':', '?', '!');

# Tokens that may precede the first real word of a line; they are skipped
# when looking for the sentence start.
# NOTE(review): the entity spellings below were reconstructed. The text this
# was recovered from had them decoded in place (yielding duplicate keys and an
# unparsable `"""`); they presumably match the escaped forms of ' " [ ] found
# in the tokenized corpus -- confirm against the tokenizer in use.
my %DELAYED_SENTENCE_START = map { $_ => 1 }
    ('(', '[', '"', "'", '&apos;', '&quot;', '&#91;', '&#93;');

# --- Pass over the corpus: collect casing statistics -----------------------

open(my $corpus_fh, '<', $CORPUS)
    or die("ERROR: could not open '$CORPUS': $!");
binmode($corpus_fh, ":utf8");

while (my $line = <$corpus_fh>) {
    # chomp, not chop: a final line without a newline must keep its last
    # character.
    chomp($line);
    my ($WORD) = split_xml($line);

    # Skip leading brackets/quotes so the token after them counts as the
    # first word of the sentence.
    my $start = 0;
    $start++
        while $start <= $#$WORD
            && defined($DELAYED_SENTENCE_START{ $WORD->[$start] });

    my $firstWordOfSentence = 1;
    for my $i ($start .. $#$WORD) {
        my $currentWord = $WORD->[$i];

        # A token following sentence-final punctuation starts a new sentence.
        # ($i-1 is only evaluated after the first iteration, so it never
        # reaches below $start.)
        if (!$firstWordOfSentence && defined($SENTENCE_END{ $WORD->[$i - 1] })) {
            $firstWordOfSentence = 1;
        }

        # Tokens without any cased letter carry no casing information.
        # They still consume the "first word" slot.
        if ($currentWord !~ /[\p{Ll}\p{Lu}\p{Lt}]/) {
            $firstWordOfSentence = 0;
            next;
        }

        my $currentWordWeight = 0;
        if (!$firstWordOfSentence) {
            # Mid-sentence casing is taken at face value.
            $currentWordWeight = 1;
        }
        elsif ($possiblyUseFirstToken) {
            my $firstChar = substr($currentWord, 0, 1);
            if (lc($firstChar) eq $firstChar) {
                # Sentence-initial token whose first character is unchanged
                # by lc(): count it fully.
                $currentWordWeight = 1;
            }
            elsif (scalar(@$WORD) == 1) {
                # The line consists of this single token: weak evidence only.
                $currentWordWeight = 0.1;
            }
        }

        if ($currentWordWeight > 0) {
            $CASING{ lc($currentWord) }{ $currentWord } += $currentWordWeight;
        }

        $firstWordOfSentence = 0;
    }
}
close($corpus_fh);

# --- Write the model ----------------------------------------------------------
# One line per lowercased word:
#   BEST (BESTCOUNT/TOTAL) ALT1 (COUNT1) ALT2 (COUNT2) ...
# Keys are sorted so that the line order and the winner among equally
# frequent forms do not depend on hash ordering.

open(my $model_fh, '>', $MODEL)
    or die("ERROR: could not create '$MODEL': $!");
binmode($model_fh, ":utf8");

foreach my $type (sort keys %CASING) {
    my $count_of = $CASING{$type};
    my @forms    = sort keys %$count_of;

    my ($score, $total, $best) = (-1, 0, "");
    foreach my $word (@forms) {
        my $count = $count_of->{$word};
        $total += $count;
        if ($count > $score) {
            $best  = $word;
            $score = $count;
        }
    }

    print {$model_fh} "$best ($score/$total)";
    foreach my $word (@forms) {
        print {$model_fh} " $word ($count_of->{$word})" unless $word eq $best;
    }
    print {$model_fh} "\n";
}

# Buffered write errors only surface at close time.
close($model_fh)
    or die("ERROR: could not write '$MODEL': $!");
|
|
|
|
|
|
|
|
|
|
|
# Split one line of (possibly XML-annotated) text into word tokens and the
# markup found between them.
#
# Parameters:
#   $line - the text to split; undef is treated as the empty string.
#
# Returns a two-element list (\@WORD, \@MARKUP):
#   @WORD   - the whitespace-separated tokens that are not XML tags, in order.
#   @MARKUP - always one element longer than @WORD. $MARKUP[$k] collects the
#             tags seen directly before $WORD[$k], each followed by a single
#             space; the final element collects the tags after the last word
#             and has its trailing space removed.
#
# Dies with "ERROR: huh? ..." if the remaining text cannot be consumed; with
# the patterns below this is a safety net rather than an expected outcome.
sub split_xml {
    my ($line) = @_;
    $line = "" unless defined $line;

    my @WORD;
    my @MARKUP = ("");

    # Peel one tag or one word off the front of $line per iteration. The /s
    # modifier lets the "rest" capture span embedded newlines, so multi-line
    # input is tokenized instead of being rejected.
    while ($line =~ /\S/) {
        # XML tag: '<' immediately followed by a non-space, up to the next '>'.
        if (my ($tag, $after_tag) = $line =~ /\A\s*(<\S[^>]*>)(.*)\z/s) {
            $MARKUP[-1] .= $tag . " ";
            $line = $after_tag;
        }
        # Word: preferably a run free of '<' and '>'; failing that (a stray
        # '<' or '>' that is not part of a tag), any run of non-space.
        elsif (my ($word, $after_word) = $line =~ /\A\s*([^\s<>]+|\S+)(.*)\z/s) {
            push @WORD,   $word;
            push @MARKUP, "";
            $line = $after_word;
        }
        else {
            die("ERROR: huh? $line\n");
        }
    }

    # Every non-empty markup entry ends in the separator space added above;
    # drop it from the last entry only.
    $MARKUP[-1] =~ s/ \z//;

    return (\@WORD, \@MARKUP);
}
|
|
|