| | |
| | |
| | |
| | |
| |
|
| | use warnings; |
| | use strict; |
| |
|
| | use utf8; |
| | use Getopt::Long "GetOptions"; |
| | use FindBin qw($RealBin); |
| | use IO::Handle; |
| | use File::Basename; |
# Modules used only by this top-level driver: directory creation and file
# copying are done in-process rather than through a shell.
use File::Path qw(make_path);
use File::Copy qw(copy);

binmode(STDIN, ':utf8');
binmode(STDOUT, ':utf8');
binmode(STDERR, ':utf8');

# Default working directory ($$ is the process ID); --out-dir overrides it.
my $OUT_DIR = "/tmp/Transliteration-Phrase-Table.$$";

my ($MOSES_SRC_DIR, $TRANSLIT_MODEL, $OOV_FILE, $EXTERNAL_BIN_DIR, $INPUT_EXTENSION, $OUTPUT_EXTENSION);
die("ERROR: wrong syntax when invoking train-transliteration-PT.pl")
    unless GetOptions('moses-src-dir=s' => \$MOSES_SRC_DIR,
                      'external-bin-dir=s' => \$EXTERNAL_BIN_DIR,
                      'transliteration-model-dir=s' => \$TRANSLIT_MODEL,
                      'input-extension=s' => \$INPUT_EXTENSION,
                      'output-extension=s' => \$OUTPUT_EXTENSION,
                      'out-dir=s' => \$OUT_DIR,
                      'oov-file=s' => \$OOV_FILE);

# Every option named in the message is checked, including --external-bin-dir:
# run_transliteration interpolates it into the train-model.perl command line,
# where an undefined value would leave "-external-bin-dir" without an argument.
die("ERROR: you need to define --moses-src-dir --external-bin-dir, --transliteration-model-dir, --oov-file, --input-extension, --output-extension")
    unless (defined($MOSES_SRC_DIR) &&
            defined($EXTERNAL_BIN_DIR) &&
            defined($TRANSLIT_MODEL) &&
            defined($OOV_FILE) &&
            defined($INPUT_EXTENSION) &&
            defined($OUTPUT_EXTENSION));

die("ERROR: could not find Transliteration Model '$TRANSLIT_MODEL'")
    unless -e $TRANSLIT_MODEL;
die("ERROR: could not find OOV file '$OOV_FILE'")
    unless -e $OOV_FILE;

# Working layout: $OUT_DIR/<oov-file-name>/ receives a copy of the OOV file
# and contains a "training" sub-directory.
my $UNK_FILE_NAME = basename($OOV_FILE);
my $work_dir = "$OUT_DIR/$UNK_FILE_NAME";
make_path("$work_dir/training");
die("ERROR: could not create directory '$work_dir/training'")
    unless -d "$work_dir/training";
copy($OOV_FILE, "$work_dir/$UNK_FILE_NAME")
    or die("ERROR: could not copy '$OOV_FILE' to '$work_dir/$UNK_FILE_NAME': $!");

# Character-segmented OOV list; the decoder's n-best output is written next
# to it with the suffix ".op.nBest".
my $translitFile = "$OUT_DIR/" . $UNK_FILE_NAME . "/" . $UNK_FILE_NAME . ".translit";

print STDERR "Preparing for Transliteration\n";
prepare_for_transliteration ($OOV_FILE , $translitFile);
print STDERR "Run Transliteration\n";
run_transliteration ($MOSES_SRC_DIR , $EXTERNAL_BIN_DIR , $TRANSLIT_MODEL , $translitFile);
print STDERR "Form Transliteration Corpus\n";
form_corpus ($translitFile , $translitFile.".op.nBest" , $OUT_DIR);
| |
|
| |
|
| | |
| |
|
# Build the character-level decoder input from an OOV word list.
#
# Arguments (positional):
#   1. path of a UTF-8 text file holding space-separated words
#   2. path of the UTF-8 file to create
#
# Every distinct word of the input is written exactly once to the output,
# one word per line, with a single space between its characters.  Lines are
# emitted in sorted order so repeated runs produce identical files.
#
# Dies if either file cannot be opened or if closing the output fails.
sub prepare_for_transliteration
{
    my ($testFile, $translitFile) = @_;
    my %UNK;

    open(my $in, "<:encoding(UTF-8)", $testFile) or die "Can't open $testFile: $!\n";

    while (my $line = <$in>)
    {
        chomp $line;

        # Splitting on a single space produces empty tokens for leading or
        # repeated spaces; they are dropped so that no blank line ends up in
        # the decoder input.
        $UNK{$_} = 1 for grep { length } split(/ /, $line);
    }
    close($in);

    open(my $out, ">:encoding(UTF-8)", $translitFile) or die "Can't open $translitFile: $!\n";

    foreach my $word (sort keys %UNK)
    {
        # An empty split pattern breaks the word into single characters.
        print {$out} join(' ', split(//, $word)), "\n";
    }

    # Buffered write errors only surface when the handle is closed.
    close($out) or die "Can't close $translitFile: $!\n";
    return;
}
| |
|
| | |
| |
|
# Filter the transliteration model to the given input and decode that input
# with moses, producing an n-best list.
#
# Arguments (positional):
#   1. Moses source directory (scripts/ and bin/ are resolved below it)
#   2. external binary directory, passed to train-model.perl
#   3. transliteration model directory (model/ and tuning/ are read below it)
#   4. path of the character-segmented input file
#
# Also reads the file-level $INPUT_EXTENSION and $OUTPUT_EXTENSION.
#
# Files written next to the input file:
#   <input>.filtered/     filtered tables
#   <input>.filtered.ini  decoder configuration
#   <input>.op            decoder output
#   <input>.op.nBest      50-best list
#
# Dies as soon as one of the external steps exits with a non-zero status,
# so that later steps never run on the output of a failed one.
sub run_transliteration
{
    my ($MOSES_SRC, $EXTERNAL_BIN_DIR, $TRANSLIT_MODEL, $eval_file) = @_;

    my $table_ini = "$eval_file.moses.table.ini";

    # Make sure the file exists before train-model.perl runs: the command
    # below names it both as the -config target and as the -lm path.
    open(my $placeholder, '>>', $table_ini) or die "Can't create $table_ini: $!\n";
    close($placeholder);

    print STDERR "Filter Table\n";

    # -first-step 9 is passed so that train-model.perl starts at its ninth step.
    _run_transliteration_step('train-model.perl', join(' ',
        "$MOSES_SRC/scripts/training/train-model.perl",
        "-mgiza -mgiza-cpus 10 -dont-zip -first-step 9",
        "-external-bin-dir $EXTERNAL_BIN_DIR -f $INPUT_EXTENSION",
        "-e $OUTPUT_EXTENSION -alignment grow-diag-final-and -parts 5",
        "-reordering msd-bidirectional-fe -score-options '--KneserNey'",
        "-phrase-translation-table $TRANSLIT_MODEL/model/phrase-table",
        "-reordering-table $TRANSLIT_MODEL/model/reordering-table",
        "-config $table_ini",
        "-lm 0:3:$table_ini:8"));

    _run_transliteration_step('filter-model-given-input.pl', join(' ',
        "$MOSES_SRC/scripts/training/filter-model-given-input.pl",
        "$eval_file.filtered $table_ini $eval_file",
        qq{-Binarizer "$MOSES_SRC/bin/CreateOnDiskPt 1 1 4 100 2"}));

    # The temporary configuration is no longer needed; failing to remove it
    # is reported but does not stop the run.
    unlink($table_ini) or warn "Can't remove $table_ini: $!\n";

    print STDERR "Apply Filter\n";

    _run_transliteration_step('substitute-filtered-tables-and-weights.perl', join(' ',
        "$MOSES_SRC/scripts/ems/support/substitute-filtered-tables-and-weights.perl",
        "$eval_file.filtered/moses.ini $TRANSLIT_MODEL/model/moses.ini",
        "$TRANSLIT_MODEL/tuning/moses.tuned.ini $eval_file.filtered.ini"));

    _run_transliteration_step('moses', join(' ',
        "$MOSES_SRC/bin/moses",
        "-search-algorithm 1 -cube-pruning-pop-limit 5000 -s 5000",
        "-threads 16 -drop-unknown -distortion-limit 0",
        "-n-best-list $eval_file.op.nBest 50",
        "-f $eval_file.filtered.ini",
        "< $eval_file",
        "> $eval_file.op"));

    return;
}

# Run one command line through the shell, discarding its standard output,
# and die unless it exits with status 0.
sub _run_transliteration_step
{
    my ($label, $command) = @_;

    # Backticks keep the child's standard output off this script's STDOUT;
    # the captured text is deliberately ignored.
    my $discarded = `$command`;

    die "ERROR: could not run $label: $!\n" if $? == -1;
    die sprintf("ERROR: %s was killed by signal %d\n", $label, $? & 127) if $? & 127;
    die sprintf("ERROR: %s exited with status %d\n", $label, $? >> 8) if $? != 0;
    return;
}
| |
|
| | |
| |
|
| |
|
# Extract candidate strings and their scores from an n-best list.
#
# Arguments (positional):
#   1. unused; accepted so existing three-argument calls keep working
#   2. path of the n-best file to read (UTF-8)
#   3. directory in which "out.txt" is created
#
# Each input line is split on single spaces and is expected to look like
#   <id> ||| <tokens ...> ||| <tokens ...> ||| <score> ...
# The tokens of the first field after the id are concatenated without
# separators and written to out.txt together with <score>, tab-separated,
# one pair per line.
#
# Lines that do not contain two further "|||" separators followed by a
# score are reported on STDERR and skipped.
#
# Dies if either file cannot be opened or if closing out.txt fails.
sub form_corpus
{
    my (undef, $testFile, $EVAL_DIR) = @_;
    my $outFile = "$EVAL_DIR/out.txt";

    open(my $nbest, "<:encoding(UTF-8)", $testFile) or die "Can't open $testFile: $!\n";
    open(my $out, ">:encoding(UTF-8)", $outFile) or die "Can't open $outFile: $!\n";

    while (my $line = <$nbest>)
    {
        chomp $line;
        my @words = split(/ /, $line);

        # Token 0 is the id and token 1 the first separator, so the
        # candidate starts at index 2.  Locating the separators up front
        # keeps a blank or truncated line from running past the end of
        # the token list.
        my @separators = grep { $words[$_] eq "|||" } 2 .. $#words;
        my $prob = @separators >= 2 ? $words[$separators[1] + 1] : undef;

        if (!defined $prob)
        {
            warn "WARNING: skipping malformed n-best line $. of $testFile\n";
            next;
        }

        my $thisStr = join('', @words[2 .. $separators[0] - 1]);
        print {$out} "$thisStr\t$prob\n";
    }

    close($nbest);
    # Buffered write errors only surface when the handle is closed.
    close($out) or die "Can't close $outFile: $!\n";
    return;
}
| |
|
| |
|