#!/bin/bash
#
# Preprocess a parallel corpus for factored NMT training.
#
# Steps, in order:
#   1. tokenize the source side (taken from CoNLL files) and the target side
#   2. train truecasing models on the training corpus and apply them
#   3. learn a joint BPE segmentation and apply it to both sides
#   4. merge the BPE-segmented source with its CoNLL annotation into factors
#   5. build network dictionaries for the BPE text and for each factor
#
# Expected inputs (relative to the current working directory), for each
# prefix in "corpus" and "newstest2013":
#   data/<prefix>.conll.$SRC   source side, CoNLL-style, tab-separated
#   data/<prefix>.$TRG         target side, plain text
#
# Every setting below can be overridden from the environment, e.g.
#   SRC=en TRG=ro mosesdecoder=$HOME/mosesdecoder ./preprocess.sh

# Abort on the first failing command, on use of an unset variable, and when
# any stage of a pipeline fails; otherwise later steps would silently run on
# empty or truncated intermediate files.
set -euo pipefail

# suffix of source language files
SRC=${SRC:-en}

# suffix of target language files
TRG=${TRG:-de}

# number of BPE merge operations (passed to learn_bpe.py -s)
bpe_operations=${bpe_operations:-89500}

# path to a checkout of the Moses decoder (only its scripts/ are used)
mosesdecoder=${mosesdecoder:-/path/to/mosesdecoder}

# path to the subword-nmt scripts (learn_bpe.py, apply_bpe.py)
subword_nmt=${subword_nmt:-/path/to/subword-nmt}

# path to nematus (only data/build_dictionary.py is used)
nematus=${nematus:-/path/to/nematus}

# Truecasing and BPE models are written here; make sure it exists.
mkdir -p model

# tokenize
for prefix in corpus newstest2013
do
    # Source side: take the second tab-separated column of the CoNLL file
    # (presumably the word form -- confirm against the data), then let awk's
    # paragraph mode (RS="") turn each blank-line-separated sentence into one
    # space-joined line ("$1=$1" rebuilds the record, the bare "7" is an
    # always-true pattern that prints it).
    cut -f 2 "data/$prefix.conll.$SRC" \
        | awk -v RS="" '{$1=$1}7' \
        | "$mosesdecoder/scripts/tokenizer/escape-special-chars.perl" -l "$SRC" \
        > "data/$prefix.tok.$SRC"

    # Target side: plain text, normalized and tokenized with the Moses scripts.
    "$mosesdecoder/scripts/tokenizer/normalize-punctuation.perl" -l "$TRG" \
        < "data/$prefix.$TRG" \
        | "$mosesdecoder/scripts/tokenizer/tokenizer.perl" -a -l "$TRG" \
        > "data/$prefix.tok.$TRG"
done

# train truecasing models on the tokenized training corpus only
for lang in "$SRC" "$TRG"
do
    "$mosesdecoder/scripts/recaser/train-truecaser.perl" \
        -corpus "data/corpus.tok.$lang" \
        -model "model/truecase-model.$lang"
done

# apply truecasing to the training corpus and to the dev/test set
for prefix in corpus newstest2013
do
    "$mosesdecoder/scripts/recaser/truecase.perl" -model "model/truecase-model.$SRC" \
        < "data/$prefix.tok.$SRC" > "data/$prefix.tc.$SRC"
    "$mosesdecoder/scripts/recaser/truecase.perl" -model "model/truecase-model.$TRG" \
        < "data/$prefix.tok.$TRG" > "data/$prefix.tc.$TRG"
done

# learn one BPE model on the concatenation of both truecased training sides
cat "data/corpus.tc.$SRC" "data/corpus.tc.$TRG" \
    | "$subword_nmt/learn_bpe.py" -s "$bpe_operations" \
    > "model/$SRC$TRG.bpe"

# apply the joint BPE model to both sides of every file
for prefix in corpus newstest2013
do
    "$subword_nmt/apply_bpe.py" -c "model/$SRC$TRG.bpe" \
        < "data/$prefix.tc.$SRC" > "data/$prefix.bpe.$SRC"
    "$subword_nmt/apply_bpe.py" -c "model/$SRC$TRG.bpe" \
        < "data/$prefix.tc.$TRG" > "data/$prefix.bpe.$TRG"
done

# combine the BPE-segmented source text with its CoNLL annotation into a
# factored representation
# NOTE(review): the helper is addressed relative to the current directory, so
# the script must be started from the directory it was written for.
for prefix in corpus newstest2013
do
    ../preprocess/conll_to_factors.py "data/$prefix.bpe.$SRC" "data/$prefix.conll.$SRC" \
        > "data/$prefix.factors.$SRC"
done

# build network dictionaries for the BPE-segmented training data
"$nematus/data/build_dictionary.py" "data/corpus.bpe.$SRC" "data/corpus.bpe.$TRG"

# write factors 1-4 of the factored training corpus to one file each and
# build a dictionary for every one of them
# NOTE(review): what each factor index denotes is decided by
# conll_to_factors.py, which is not visible here.
for i in {1..4}
do
    "$mosesdecoder/scripts/training/reduce-factors.perl" \
        --corpus "data/corpus.factors.$SRC" \
        --reduced-corpus "data/corpus.factors.$i.$SRC" \
        --factor "$i"
    "$nematus/data/build_dictionary.py" "data/corpus.factors.$i.$SRC"
done