File size: 7,926 Bytes
fd49381 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 |
#!/usr/bin/env perl
#
# This file is part of moses. Its use is licensed under the GNU Lesser General
# Public License version 2.1 or, at your option, any later version.
# $Id$
use warnings;
use strict;
use FindBin qw($Bin);
use Getopt::Long "GetOptions";
# Standard input and output are handled as UTF-8 text.
binmode STDIN,  ":utf8";
binmode STDOUT, ":utf8";

# Values that are only set from the command line (no defaults).
my $DIR;                 # --dir
my $CORPUS;              # --corpus
my $SCRIPTS_ROOT_DIR;    # --scripts-root-dir
my $CONFIG;              # --config
my $HELP;                # --help
my $ERROR;               # first/last problem detected while reading arguments

# Location of the compiled tools, relative to this script's directory.
my $MOSES_BIN_DIR = "$Bin/../../bin";

# Defaults; several of them can be overridden by the switches below.
my $LM                 = "KENLM";                  # KENLM is default.
my $BUILD_LM           = "build-lm.sh";
my $BUILD_KENLM        = "$MOSES_BIN_DIR/lmplz";
my $BUILD_BINARY       = "$MOSES_BIN_DIR/build_binary";
my $EXTRACT            = "$MOSES_BIN_DIR/extract";
my $SCORE              = "$MOSES_BIN_DIR/score";
my $CONSOLIDATE_DIRECT = "$MOSES_BIN_DIR/consolidate-direct";
my $NGRAM_COUNT        = "ngram-count";
my $TRAIN_SCRIPT       = "$Bin/../training/train-model.perl";
my $MAX_LEN            = 1;
my $FIRST_STEP         = 1;
my $LAST_STEP          = 11;

# apply switches: each option specification is paired with the variable
# that receives its value.
my @option_spec = (
    'first-step=i'       => \$FIRST_STEP,
    'last-step=i'        => \$LAST_STEP,
    'corpus=s'           => \$CORPUS,
    'config=s'           => \$CONFIG,
    'dir=s'              => \$DIR,
    'ngram-count=s'      => \$NGRAM_COUNT,
    'build-lm=s'         => \$BUILD_LM,
    'build-kenlm=s'      => \$BUILD_KENLM,
    'lm=s'               => \$LM,
    'train-script=s'     => \$TRAIN_SCRIPT,
    'scripts-root-dir=s' => \$SCRIPTS_ROOT_DIR,
    'max-len=i'          => \$MAX_LEN,
    'help'               => \$HELP,
);
if (!GetOptions(@option_spec)) {
    $ERROR = "training Aborted.";
}
# check and set default to unset parameters
#
# Only the first problem is reported: an error already recorded while
# parsing the switches is not overwritten by the checks below. No checks
# are made when --help was requested.
if (!defined($ERROR) && !defined($HELP)) {
    if (!defined($DIR)) {
        $ERROR = "please specify working dir --dir";
    }
    elsif (!defined($CORPUS) && $FIRST_STEP <= 2 && $LAST_STEP >= 1) {
        # Steps 1 and 2 read the corpus given on the command line.
        $ERROR = "please specify --corpus";
    }
}

# Print the usage text (to STDERR) on --help or on any argument error, then
# stop: exit status 1 for an error, 0 for a plain --help.
if ($HELP || $ERROR) {
    if ($ERROR) {
        print STDERR "ERROR: " . $ERROR . "\n";
    }
    print STDERR "Usage: $0 --dir /output/recaser --corpus /Cased/corpus/files [options ...]";
    # The documented defaults mirror the values assigned at the top of the
    # script; single quotes keep the literal "$PATH" from being interpolated.
    print STDERR join("\n",
        '',
        '',
        'Options:',
        '== MANDATORY ==',
        '--dir=dir ... outputted recaser directory.',
        '--corpus=file ... inputted cased corpus.',
        '== OPTIONAL ==',
        '= Recaser Training configuration =',
        '--train-script=file ... path to the train script (default: ../training/train-model.perl relative to this script).',
        '--config=config ... training script configuration.',
        '--scripts-root-dir=dir ... scripts directory.',
        '--max-len=int ... max phrase length (default: 1).',
        '= Language Model Training configuration =',
        '--lm=[IRSTLM,SRILM,KENLM] ... language model (default: KENLM).',
        '--build-lm=file ... path to build-lm.sh if not in $PATH (used only with --lm=IRSTLM).',
        '--ngram-count=file ... path to ngram-count if not in $PATH (used only with --lm=SRILM).',
        '--build-kenlm=file ... path to lmplz (default: ../../bin/lmplz relative to this script; used only with --lm=KENLM).',
        '= Steps this script will perform =',
        '(1) Truecasing;',
        '(2) Language Model Training;',
        '(3) Data Preparation',
        '(4-10) Recaser Model Training;',
        '(11) Cleanup.',
        '--first-step=[1-11] ... step where script starts (default: 1).',
        '--last-step=[1-11] ... step where script ends (default: 11).',
        '--help ... this usage output.',
        '');
    exit($ERROR ? 1 : 0);
}
# main loop
#
# Runs the requested range of steps. Every step is now gated on both
# --first-step and --last-step, so that e.g. "--last-step 1" stops after
# truecasing instead of also training the language model, and
# "--last-step 3" stops after data preparation.
#
# Steps 4-10 are handled as a single unit: the training script is started
# with a --first-step but is never given a last step, so any --last-step
# value from 4 upwards runs the complete recaser model training.

# Create the working directory without going through a shell, and stop
# right away if that fails rather than failing later in some step.
system("mkdir", "-p", $DIR) == 0
    || die("ERROR: could not create working directory $DIR\n");

truecase() if $FIRST_STEP == 1 && $LAST_STEP >= 1;

# A truecased corpus in the working directory (from this or an earlier run)
# replaces the corpus given on the command line for all following steps.
$CORPUS = "$DIR/aligned.truecased" if (-e "$DIR/aligned.truecased");

train_lm()           if $FIRST_STEP <= 2  && $LAST_STEP >= 2;
prepare_data()       if $FIRST_STEP <= 3  && $LAST_STEP >= 3;
train_recase_model() if $FIRST_STEP <= 10 && $LAST_STEP >= 4;
cleanup()            if $LAST_STEP == 11;
exit(0);
### subs ###
# Step 1: train a truecaser model on the corpus and apply it.
#
# Runs train-truecaser.perl and then truecase.perl (both taken from this
# script's directory). The model is written to $DIR/truecaser_model and the
# truecased corpus to $DIR/aligned.truecased. Dies as soon as one of the two
# commands returns a non-zero status.
sub truecase {
    print STDERR "(1) Truecase data @ " . `date`;
    print STDERR "(1) To build model without truecasing, use --first-step 2, and make sure $DIR/aligned.truecased does not exist\n";

    # Each stage: the word that starts its failure message, and its command.
    my @stages = (
        [ "Training", "$Bin/train-truecaser.perl --model $DIR/truecaser_model --corpus $CORPUS" ],
        [ "Applying", "$Bin/truecase.perl --model $DIR/truecaser_model < $CORPUS > $DIR/aligned.truecased" ],
    );

    for my $stage (@stages) {
        my ($verb, $command) = @$stage;
        print STDERR $command . "\n";
        if (system($command) != 0) {
            die("$verb truecaser died with error " . ($? >> 8) . "\n");
        }
    }
}
# Step 2: train a trigram language model on the cased corpus.
#
# The toolkit is selected by $LM (compared case-insensitively):
#   IRSTLM -> $BUILD_LM,    output $DIR/cased.irstlm.gz
#   SRILM  -> $NGRAM_COUNT, output $DIR/cased.srilm.gz
#   anything else is treated as KENLM -> $BUILD_KENLM writes an ARPA file,
#   which $BUILD_BINARY then converts to $DIR/cased.kenlm.
# For SRILM and KENLM, $LM is rewritten to its canonical upper-case name.
#
# Dies when the training command or the KenLM binarization fails. The
# intermediate ARPA file is removed only after a successful binarization, so
# a failed conversion no longer silently leaves the run without a model.
sub train_lm {
    print STDERR "(2) Train language model on cased data @ ".`date`;
    my $cmd = "";
    my $requested = uc($LM);
    if ($requested eq "IRSTLM") {
        $cmd = "$BUILD_LM -t /tmp -i $CORPUS -n 3 -o $DIR/cased.irstlm.gz";
    }
    elsif ($requested eq "SRILM") {
        $LM = "SRILM";
        $cmd = "$NGRAM_COUNT -text $CORPUS -lm $DIR/cased.srilm.gz -interpolate -kndiscount";
    }
    else {
        $LM = "KENLM";
        $cmd = "$BUILD_KENLM --prune 0 0 1 -S 5% -T $DIR/lmtmp --order 3 --text $CORPUS --arpa $DIR/cased.kenlm.arpa.gz";
    }
    print STDERR "** Using $LM **" . "\n";
    print STDERR $cmd."\n";
    system($cmd) == 0 || die("Language model training failed with error " . ($? >> 8) . "\n");

    if ($LM eq "KENLM") {
        my $arpa = "$DIR/cased.kenlm.arpa.gz";
        my $binarize = "$BUILD_BINARY $arpa $DIR/cased.kenlm";
        print STDERR $binarize."\n";
        system($binarize) == 0
            || die("Language model binarization failed with error " . ($? >> 8) . "\n");
        # The ARPA file is only an intermediate product; failing to remove
        # it is not fatal.
        unlink($arpa)
            || print STDERR "WARNING: could not remove $arpa: $!\n";
    }
}
# Step 3: build the parallel training files for the recasing model.
#
# Reads $CORPUS line by line and writes, for every kept line:
#   $DIR/aligned.cased      - the cleaned line
#   $DIR/aligned.lowercased - the same line lower-cased
#   $DIR/aligned.a          - a one-to-one alignment "0-0 1-1 ..." with one
#                             entry per whitespace-separated token
# Lines longer than 2000 characters and lines that are empty after cleaning
# are skipped. Cleaning removes NUL characters and "|", collapses runs of
# spaces and strips a leading and a trailing space.
#
# Dies if the corpus is not set, if any file cannot be opened, or if an
# output file cannot be closed (write errors can surface at close time).
sub prepare_data {
    print STDERR "\n(3) Preparing data for training recasing model @ ".`date`;

    defined($CORPUS)
        || die("ERROR: no corpus available; specify --corpus or provide $DIR/aligned.truecased\n");

    my $cased_file      = "$DIR/aligned.cased";
    my $lowercased_file = "$DIR/aligned.lowercased";
    my $alignment_file  = "$DIR/aligned.a";

    open(my $corpus_fh, '<:utf8', $CORPUS)
        || die("ERROR: cannot read corpus $CORPUS: $!\n");
    open(my $cased_fh, '>:utf8', $cased_file)
        || die("ERROR: cannot write $cased_file: $!\n");
    # This file name has always been echoed on STDOUT; kept for callers
    # that may rely on it.
    print "$DIR/aligned.lowercased\n";
    open(my $lowercased_fh, '>:utf8', $lowercased_file)
        || die("ERROR: cannot write $lowercased_file: $!\n");
    open(my $alignment_fh, '>', $alignment_file)
        || die("ERROR: cannot write $alignment_file: $!\n");

    while (my $line = <$corpus_fh>) {
        next if length($line) > 2000;
        $line =~ s/\x{0}//g;          # drop NUL characters
        $line =~ s/\|//g;             # drop "|" characters
        $line =~ s/ +/ /g;            # collapse runs of spaces
        $line =~ s/^ //;              # strip leading space
        $line =~ s/ [\r\n]*$/\n/;     # strip trailing space before the line end
        next if $line =~ /^$/;

        print {$cased_fh} $line;
        print {$lowercased_fh} lc($line);

        # Token i of the lower-cased side aligns to token i of the cased side.
        my @tokens = split(' ', $line);
        for my $i (0 .. $#tokens) {
            print {$alignment_fh} "$i-$i ";
        }
        print {$alignment_fh} "\n";
    }

    close($corpus_fh);
    close($cased_fh)      || die("ERROR: cannot finish writing $cased_file: $!\n");
    close($lowercased_fh) || die("ERROR: cannot finish writing $lowercased_file: $!\n");
    close($alignment_fh)  || die("ERROR: cannot finish writing $alignment_file: $!\n");
}
# Steps 4-10: train the recasing model with the training script.
#
# With --max-len 1 the phrase table is built directly here (extract, sort,
# score, consolidate-direct, gzip) and the training script is started at its
# step 9. Otherwise the training script starts at $FIRST_STEP, but never
# earlier than step 4.
#
# Dies if any command fails. This now includes the final gzip of the phrase
# table, which is run with -f so that re-running in an existing working
# directory replaces an old phrase-table.gz instead of silently keeping it.
sub train_recase_model {
    print STDERR "\n(4) Training recasing model @ ".`date`;
    my $first = $FIRST_STEP;
    $first = 4 if $first < 4;

    if ($MAX_LEN == 1) {
        # Each stage: the label used in its failure message, and its command.
        my @stages = (
            [ "extract",            "$EXTRACT $DIR/aligned.cased $DIR/aligned.lowercased $DIR/aligned.a $DIR/extract 1" ],
            [ "sort extract",       "sort -S 2G $DIR/extract > $DIR/extract.sorted" ],
            [ "score",              "$SCORE $DIR/extract.sorted /dev/null $DIR/phrase-table-half --NoLex" ],
            [ "consolidate-direct", "$CONSOLIDATE_DIRECT $DIR/phrase-table-half $DIR/phrase-table" ],
        );
        for my $stage (@stages) {
            my ($label, $stage_cmd) = @$stage;
            system($stage_cmd) == 0
                || die("ERROR: $label (special case max-len 1) failed: $stage_cmd");
        }

        # The half table is only an intermediate file; failing to remove it
        # is not fatal.
        unlink("$DIR/phrase-table-half")
            || print STDERR "WARNING: could not remove $DIR/phrase-table-half: $!\n";

        my $gzip_cmd = "gzip -f $DIR/phrase-table";
        system($gzip_cmd) == 0
            || die("ERROR: gzip phrase-table (special case max-len 1) failed: $gzip_cmd");

        $first = 9;
    }

    my $cmd = "$TRAIN_SCRIPT --root-dir $DIR --model-dir $DIR --first-step $first --alignment a --corpus $DIR/aligned --f lowercased --e cased --max-phrase-length $MAX_LEN";
    $cmd .= ($MAX_LEN == 1)
        ? " --score-options='--NoLex --OnlyDirect'"
        : " --score-options='--OnlyDirect'";

    # Language model file name and trailing type code for each toolkit;
    # anything that is not IRSTLM or SRILM is treated as KENLM.
    my %lm_suffix_for = (
        IRSTLM => "cased.irstlm.gz:1",
        SRILM  => "cased.srilm.gz:8",
    );
    my $lm_suffix = $lm_suffix_for{ uc($LM) } || "cased.kenlm:8";
    $cmd .= " --lm 0:3:$DIR/$lm_suffix";

    $cmd .= " -config $CONFIG" if $CONFIG;
    print STDERR $cmd."\n";
    system($cmd) == 0 || die("Recaser model training failed with error " . ($? >> 8) . "\n");
}
# Step 11: remove intermediate files from the working directory.
#
# Deletes $DIR/extract*, $DIR/aligned*, $DIR/lex* and $DIR/truecaser_model
# with "rm -f". All four removals are always attempted; if the combined exit
# status is non-zero a notice is printed to STDERR. Never dies.
sub cleanup {
    print STDERR "\n(11) Cleaning up @ " . `date`;

    my $status_total = 0;
    for my $target ("extract*", "aligned*", "lex*", "truecaser_model") {
        `rm -f $DIR/$target`;
        $status_total += $?;
    }

    unless ($status_total == 0) {
        print STDERR "Training successful but some files could not be cleaned.\n";
    }
}
|